In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas # Import geopandas for geospatial analysis
import warnings
warnings.filterwarnings('ignore')
# --- Configuration for file paths ---
# Redefine paths within this block to ensure they are always in scope
PROJECT_ROOT = ""
CLEANED_DATA_DIR = os.path.join(PROJECT_ROOT, "Datasets", "Cleaned_Preprocessed")
CLEANED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_cleaned.csv")
FULLY_CLEANED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_fully_cleaned.csv")
GEOJSON_FILE = os.path.join(PROJECT_ROOT, "Datasets", "GeoJSON", "gadm41_IND_2.json")
# Load the dataset
try:
df = pd.read_csv(CLEANED_CSV_FILE)
print("Dataset loaded successfully with updated path!")
except FileNotFoundError:
print(f"Error: The file '{CLEANED_CSV_FILE}' was not found.")
print("Please ensure the project structure matches the image (ML_CA1 as root) and the file name is correct.")
exit()
except Exception as e:
print(f"An unexpected error occurred while loading the dataset: {e}")
exit()
Dataset loaded successfully with updated path!
In [2]:
# Display basic info after reload
print("\n--- DataFrame Info after initial load ---")
df.info()
--- DataFrame Info after initial load --- <class 'pandas.core.frame.DataFrame'> RangeIndex: 302753 entries, 0 to 302752 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fin_year 302752 non-null object 1 month 302752 non-null object 2 state_code 302752 non-null float64 3 State 302752 non-null object 4 district_code 302752 non-null float64 5 District 302752 non-null object 6 Approved_Labour_Budget 302752 non-null float64 7 Average_Wage_rate_per_day_per_person 302752 non-null float64 8 Average_days_of_employment_provided_per_Household 302752 non-null float64 9 Differently_abled_persons_worked 302752 non-null float64 10 Material_and_skilled_Wages 302752 non-null float64 11 Number_of_Completed_Works 302752 non-null float64 12 Number_of_GPs_with_NIL_exp 302752 non-null float64 13 Number_of_Ongoing_Works 302752 non-null float64 14 Persondays_of_Central_Liability_so_far 302752 non-null float64 15 SC_persondays 302752 non-null float64 16 SC_workers_against_active_workers 302752 non-null float64 17 ST_persondays 302752 non-null float64 18 ST_workers_against_active_workers 302752 non-null float64 19 Total_Adm_Expenditure 302752 non-null float64 20 Total_Exp 302752 non-null float64 21 Total_Households_Worked 302752 non-null float64 22 Total_Individuals_Worked 302752 non-null float64 23 Total_No_of_Active_Job_Cards 302752 non-null float64 24 Total_No_of_Active_Workers 302752 non-null float64 25 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 302752 non-null float64 26 Total_No_of_JobCards_issued 302752 non-null float64 27 Total_No_of_Workers 302752 non-null float64 28 Total_No_of_Works_Takenup 302752 non-null float64 29 Wages 302752 non-null float64 30 Women_Persondays 302752 non-null float64 31 percent_of_Category_B_Works 302752 non-null float64 32 percent_of_Expenditure_on_Agriculture_Allied_Works 302752 non-null float64 33 percent_of_NRM_Expenditure 302752 non-null float64 34 percentage_payments_gererated_within_15_days 302752 non-null float64 35 Remarks 1 non-null object dtypes: float64(31), object(5) memory usage: 83.2+ MB
In [3]:
print("\n--- First 3 rows of the DataFrame ---")
print(df.head(3))
--- First 3 rows of the DataFrame ---
fin_year month state_code State district_code District \
0 2019-2020 June 35.0 UTTARAKHAND 3506.0 RUDRA PRAYAG
1 2019-2020 June 35.0 UTTARAKHAND 3508.0 NAINITAL
2 2019-2020 June 35.0 UTTARAKHAND 3512.0 BAGESHWAR
Approved_Labour_Budget Average_Wage_rate_per_day_per_person \
0 323294.0 181.735337
1 252505.0 177.663900
2 241752.0 171.565915
Average_days_of_employment_provided_per_Household \
0 27.0
1 29.0
2 25.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
0 8.0 ... 49606.0
1 21.0 ... 58673.0
2 57.0 ... 48309.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
0 89741.0 4422.0 352.164918
1 110623.0 5760.0 546.629180
2 92833.0 2528.0 302.280270
Women_Persondays percent_of_Category_B_Works \
0 114158.0 45.0
1 137899.0 54.0
2 95237.0 33.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
0 60.57
1 82.66
2 77.87
percent_of_NRM_Expenditure percentage_payments_gererated_within_15_days \
0 52.24 100.00
1 67.05 100.02
2 70.21 100.03
Remarks
0 NaN
1 NaN
2 NaN
[3 rows x 36 columns]
In [4]:
print("--- Starting ETL Step 2: Data Type Refinement ---")
# 1. Convert 'fin_year' and 'month' to a proper datetime column
# Indian financial year: April to March
month_mapping = {
'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
# Ensure 'month' column is treated as string before mapping
df['month_num'] = df['month'].astype(str).map(month_mapping)
# Extract the start year from 'fin_year' (e.g., '2019' from '2019-2020')
# Handle potential NaN or non-string values in 'fin_year'
df['start_year'] = df['fin_year'].apply(lambda x: int(str(x).split('-')[0]) if pd.notna(x) and '-' in str(x) else np.nan)
# Determine the calendar year for each row based on financial year logic
# Months Jan-Mar belong to the second year of the financial year
# Months Apr-Dec belong to the first year of the financial year
def get_calendar_year(row):
if pd.isna(row['start_year']) or pd.isna(row['month_num']):
return np.nan
# Cast to int to ensure correct arithmetic for year
current_start_year = int(row['start_year'])
current_month_num = int(row['month_num'])
if current_month_num >= 4: # April (4) to December (12)
return current_start_year
else: # January (1) to March (3)
return current_start_year + 1
df['calendar_year'] = df.apply(get_calendar_year, axis=1)
# Create the 'Date' column using a specific format for robust parsing
# Convert month_num and calendar_year to int and then string for the date format
df['Date'] = pd.to_datetime(
df['calendar_year'].fillna(0).astype(int).astype(str) + '-' +
df['month_num'].fillna(0).astype(int).astype(str) + '-01',
format='%Y-%m-%d', # Explicitly specify format to avoid UserWarning
errors='coerce' # Coerce errors to NaT
)
# Drop intermediate date columns if they are no longer needed
df.drop(columns=['month_num', 'start_year', 'calendar_year'], inplace=True, errors='ignore')
# 2. Convert 'State' and 'District' to categorical type for memory efficiency
# Handle potential NaN values in 'State' and 'District' before converting to category
# They will be treated as a special category by pandas.
df['State'] = df['State'].astype('category')
df['District'] = df['District'].astype('category')
# 3. Handle 'Remarks' column
print("\n--- Unique values and count for 'Remarks' column before decision ---")
print(df['Remarks'].value_counts(dropna=False))
--- Starting ETL Step 2: Data Type Refinement --- --- Unique values and count for 'Remarks' column before decision --- Remarks NaN 302752 Unable to Fatch Data 1 Name: count, dtype: int64
In [5]:
# Re-evaluating the condition for dropping 'Remarks'
# If it has only NaN and one specific "Unable to Fatch Data" value, and is still mostly NaN
# It might be more beneficial to simply drop it due to its non-informative nature for analysis.
# Let's be more decisive here: if it's almost all NaNs AND the non-NaN values are generic errors.
if df['Remarks'].isnull().sum() / len(df) > 0.99 and df['Remarks'].nunique(dropna=True) <= 1:
df.drop(columns=['Remarks'], inplace=True, errors='ignore')
print("Dropped 'Remarks' column due to high sparsity and non-informative values.")
else:
print("'Remarks' column retained. Further investigation might be needed or it could be useful later.")
# If retained, consider cleaning up the 'Unable to Fatch Data' string if it means NaN
df['Remarks'] = df['Remarks'].replace('Unable to Fatch Data', np.nan)
# 4. Refine 'state_code' and 'district_code' to integer type
# First, ensure there are no NaNs that would prevent int conversion.
# If there are, we'll fill them before converting to int.
print("\n--- Null counts for code columns before integer conversion ---")
print(df[['state_code', 'district_code']].isnull().sum())
Dropped 'Remarks' column due to high sparsity and non-informative values. --- Null counts for code columns before integer conversion --- state_code 1 district_code 1 dtype: int64
In [6]:
# We found 1 NaN in state_code and 1 in district_code from previous output.
# The fillna(-1) ensures they can be converted to int.
if df['state_code'].isnull().any():
df['state_code'] = df['state_code'].fillna(-1).astype(int)
print("Filled NaNs in 'state_code' with -1 and converted to int.")
else:
df['state_code'] = df['state_code'].astype(int)
print("Converted 'state_code' to int.")
if df['district_code'].isnull().any():
df['district_code'] = df['district_code'].fillna(-1).astype(int)
print("Filled NaNs in 'district_code' with -1 and converted to int.")
else:
df['district_code'] = df['district_code'].astype(int)
print("Converted 'district_code' to int.")
# --- Final check after ETL Step 2 ---
print("\n--- DataFrame Info after ETL Step 2 ---")
df.info()
Filled NaNs in 'state_code' with -1 and converted to int. Filled NaNs in 'district_code' with -1 and converted to int. --- DataFrame Info after ETL Step 2 --- <class 'pandas.core.frame.DataFrame'> RangeIndex: 302753 entries, 0 to 302752 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fin_year 302752 non-null object 1 month 302752 non-null object 2 state_code 302753 non-null int64 3 State 302752 non-null category 4 district_code 302753 non-null int64 5 District 302752 non-null category 6 Approved_Labour_Budget 302752 non-null float64 7 Average_Wage_rate_per_day_per_person 302752 non-null float64 8 Average_days_of_employment_provided_per_Household 302752 non-null float64 9 Differently_abled_persons_worked 302752 non-null float64 10 Material_and_skilled_Wages 302752 non-null float64 11 Number_of_Completed_Works 302752 non-null float64 12 Number_of_GPs_with_NIL_exp 302752 non-null float64 13 Number_of_Ongoing_Works 302752 non-null float64 14 Persondays_of_Central_Liability_so_far 302752 non-null float64 15 SC_persondays 302752 non-null float64 16 SC_workers_against_active_workers 302752 non-null float64 17 ST_persondays 302752 non-null float64 18 ST_workers_against_active_workers 302752 non-null float64 19 Total_Adm_Expenditure 302752 non-null float64 20 Total_Exp 302752 non-null float64 21 Total_Households_Worked 302752 non-null float64 22 Total_Individuals_Worked 302752 non-null float64 23 Total_No_of_Active_Job_Cards 302752 non-null float64 24 Total_No_of_Active_Workers 302752 non-null float64 25 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 302752 non-null float64 26 Total_No_of_JobCards_issued 302752 non-null float64 27 Total_No_of_Workers 302752 non-null float64 28 Total_No_of_Works_Takenup 302752 non-null float64 29 Wages 302752 non-null float64 30 Women_Persondays 302752 non-null float64 31 percent_of_Category_B_Works 302752 non-null float64 32 percent_of_Expenditure_on_Agriculture_Allied_Works 302752 non-null float64 33 percent_of_NRM_Expenditure 302752 non-null float64 34 percentage_payments_gererated_within_15_days 302752 non-null float64 35 Date 161331 non-null datetime64[ns] dtypes: category(2), datetime64[ns](1), float64(29), int64(2), object(2) memory usage: 79.4+ MB
In [7]:
print("\n--- First 3 rows of DataFrame after ETL Step 2 ---")
print(df.head(3))
# Save the fully cleaned data
try:
df.to_csv(FULLY_CLEANED_CSV_FILE, index=False)
print(f"\nFully cleaned data saved to: {FULLY_CLEANED_CSV_FILE}")
except Exception as e:
print(f"Error saving fully cleaned data: {e}")
--- First 3 rows of DataFrame after ETL Step 2 ---
fin_year month state_code State district_code District \
0 2019-2020 June 35 UTTARAKHAND 3506 RUDRA PRAYAG
1 2019-2020 June 35 UTTARAKHAND 3508 NAINITAL
2 2019-2020 June 35 UTTARAKHAND 3512 BAGESHWAR
Approved_Labour_Budget Average_Wage_rate_per_day_per_person \
0 323294.0 181.735337
1 252505.0 177.663900
2 241752.0 171.565915
Average_days_of_employment_provided_per_Household \
0 27.0
1 29.0
2 25.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
0 8.0 ... 49606.0
1 21.0 ... 58673.0
2 57.0 ... 48309.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
0 89741.0 4422.0 352.164918
1 110623.0 5760.0 546.629180
2 92833.0 2528.0 302.280270
Women_Persondays percent_of_Category_B_Works \
0 114158.0 45.0
1 137899.0 54.0
2 95237.0 33.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
0 60.57
1 82.66
2 77.87
percent_of_NRM_Expenditure percentage_payments_gererated_within_15_days \
0 52.24 100.00
1 67.05 100.02
2 70.21 100.03
Date
0 2019-06-01
1 2019-06-01
2 2019-06-01
[3 rows x 36 columns]
Error saving fully cleaned data: [Errno 13] Permission denied: 'Datasets\\Cleaned_Preprocessed\\mgnrega_data_fully_cleaned.csv'
In [8]:
print("--- Starting ETL Step 3: Debugging Date Column and Final Preprocessing ---")
# --- Re-apply critical conversions from ETL Step 2 to ensure consistency ---
# (This is important if running this cell independently in a new session)
# 1. Convert 'fin_year' and 'month' to a proper datetime column
# Updated month mapping to include abbreviated forms and standardize casing
month_mapping = {
'January': 1, 'Jan': 1,
'February': 2, 'Feb': 2,
'March': 3, 'Mar': 3,
'April': 4, 'Apr': 4,
'May': 5,
'June': 6, 'Jun': 6,
'July': 7, 'Jul': 7,
'August': 8, 'Aug': 8,
'September': 9, 'Sep': 9,
'October': 10, 'Oct': 10,
'November': 11, 'Nov': 11,
'December': 12, 'Dec': 12
}
# Convert 'month' column to string type and then to title case for consistent mapping
df['month_processed'] = df['month'].astype(str).str.title()
df['month_num'] = df['month_processed'].map(month_mapping)
# Drop the intermediate 'month_processed' column
df.drop(columns=['month_processed'], inplace=True, errors='ignore')
# Extract the start year from 'fin_year' (e.g., '2019' from '2019-2020')
# Ensure 'fin_year' is string and handle potential NaNs before splitting
df['start_year'] = df['fin_year'].apply(lambda x: int(str(x).split('-')[0]) if pd.notna(x) and '-' in str(x) else np.nan)
def get_calendar_year(row):
if pd.isna(row['start_year']) or pd.isna(row['month_num']):
return np.nan
current_start_year = int(row['start_year'])
current_month_num = int(row['month_num'])
if current_month_num >= 4: # April (4) to December (12) of the financial year start
return current_start_year
else: # January (1) to March (3) fall into the next calendar year for this financial year
return current_start_year + 1
df['calendar_year'] = df.apply(get_calendar_year, axis=1)
# Create the 'Date' column using explicit format
df['Date'] = pd.to_datetime(
df['calendar_year'].astype(str) + '-' +
df['month_num'].astype(str) + '-01',
format='%Y-%m-%d', # Explicitly specify format for robustness
errors='coerce' # Coerce errors to NaT
)
# Drop intermediate date columns that are no longer needed
df.drop(columns=['month_num', 'start_year', 'calendar_year'], inplace=True, errors='ignore')
# 2. Convert 'State' and 'District' to categorical type (re-applying as safeguard)
df['State'] = df['State'].astype('category')
df['District'] = df['District'].astype('category')
# 3. Handle 'Remarks' column (re-evaluation and final drop)
# Check if 'Remarks' column exists before trying to access it
if 'Remarks' in df.columns:
print("\n--- Unique values and count for 'Remarks' column before final decision ---")
print(df['Remarks'].value_counts(dropna=False))
# Replace specific non-informative string with NaN
df['Remarks'] = df['Remarks'].replace('Unable to Fatch Data', np.nan)
# Now, check if it's entirely NaN after replacement for dropping
if df['Remarks'].isnull().all(): # Check if all values are NaN
df.drop(columns=['Remarks'], inplace=True, errors='ignore')
print("Confirmed 'Remarks' column is entirely NaN and dropped.")
else:
print("'Remarks' column still contains non-NaN values after processing. Retained.")
else:
print("'Remarks' column does not exist in DataFrame. Skipping remarks handling.")
# 4. Refine 'state_code' and 'district_code' to integer type (re-applying as safeguard)
df['state_code'] = df['state_code'].fillna(-1).astype(int)
df['district_code'] = df['district_code'].fillna(-1).astype(int)
# --- Final check after ETL Step 3 ---
print("\n--- DataFrame Info after ETL Step 3 ---")
df.info()
--- Starting ETL Step 3: Debugging Date Column and Final Preprocessing --- 'Remarks' column does not exist in DataFrame. Skipping remarks handling. --- DataFrame Info after ETL Step 3 --- <class 'pandas.core.frame.DataFrame'> RangeIndex: 302753 entries, 0 to 302752 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fin_year 302752 non-null object 1 month 302752 non-null object 2 state_code 302753 non-null int64 3 State 302752 non-null category 4 district_code 302753 non-null int64 5 District 302752 non-null category 6 Approved_Labour_Budget 302752 non-null float64 7 Average_Wage_rate_per_day_per_person 302752 non-null float64 8 Average_days_of_employment_provided_per_Household 302752 non-null float64 9 Differently_abled_persons_worked 302752 non-null float64 10 Material_and_skilled_Wages 302752 non-null float64 11 Number_of_Completed_Works 302752 non-null float64 12 Number_of_GPs_with_NIL_exp 302752 non-null float64 13 Number_of_Ongoing_Works 302752 non-null float64 14 Persondays_of_Central_Liability_so_far 302752 non-null float64 15 SC_persondays 302752 non-null float64 16 SC_workers_against_active_workers 302752 non-null float64 17 ST_persondays 302752 non-null float64 18 ST_workers_against_active_workers 302752 non-null float64 19 Total_Adm_Expenditure 302752 non-null float64 20 Total_Exp 302752 non-null float64 21 Total_Households_Worked 302752 non-null float64 22 Total_Individuals_Worked 302752 non-null float64 23 Total_No_of_Active_Job_Cards 302752 non-null float64 24 Total_No_of_Active_Workers 302752 non-null float64 25 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 302752 non-null float64 26 Total_No_of_JobCards_issued 302752 non-null float64 27 Total_No_of_Workers 302752 non-null float64 28 Total_No_of_Works_Takenup 302752 non-null float64 29 Wages 302752 non-null float64 30 Women_Persondays 302752 non-null float64 31 percent_of_Category_B_Works 302752 non-null float64 32 percent_of_Expenditure_on_Agriculture_Allied_Works 302752 non-null float64 33 percent_of_NRM_Expenditure 302752 non-null float64 34 percentage_payments_gererated_within_15_days 302752 non-null float64 35 Date 0 non-null datetime64[ns] dtypes: category(2), datetime64[ns](1), float64(29), int64(2), object(2) memory usage: 79.4+ MB
In [9]:
print("\n--- First 5 rows of DataFrame after ETL Step 3 ---")
print(df.head())
--- First 5 rows of DataFrame after ETL Step 3 ---
fin_year month state_code State district_code \
0 2019-2020 June 35 UTTARAKHAND 3506
1 2019-2020 June 35 UTTARAKHAND 3508
2 2019-2020 June 35 UTTARAKHAND 3512
3 2019-2020 June 37 LADAKH 3707
4 2019-2020 Aug 14 JAMMU AND KASHMIR 1411
District Approved_Labour_Budget Average_Wage_rate_per_day_per_person \
0 RUDRA PRAYAG 323294.0 181.735337
1 NAINITAL 252505.0 177.663900
2 BAGESHWAR 241752.0 171.565915
3 LEH (LADAKH) 0.0 85814.193438
4 POONCH 640000.0 350.059928
Average_days_of_employment_provided_per_Household \
0 27.0
1 29.0
2 25.0
3 10.0
4 37.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
0 8.0 ... 49606.0
1 21.0 ... 58673.0
2 57.0 ... 48309.0
3 0.0 ... 17284.0
4 76.0 ... 101984.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
0 89741.0 4422.0 352.164918
1 110623.0 5760.0 546.629180
2 92833.0 2528.0 302.280270
3 36738.0 1328.0 601.557496
4 207375.0 9755.0 2766.306570
Women_Persondays percent_of_Category_B_Works \
0 114158.0 45.0
1 137899.0 54.0
2 95237.0 33.0
3 476.0 1.0
4 279921.0 5.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
0 60.57
1 82.66
2 77.87
3 21.38
4 19.61
percent_of_NRM_Expenditure percentage_payments_gererated_within_15_days \
0 52.24 100.00
1 67.05 100.02
2 70.21 100.03
3 27.70 0.00
4 48.57 1446.75
Date
0 NaT
1 NaT
2 NaT
3 NaT
4 NaT
[5 rows x 36 columns]
In [10]:
print("\n--- Last 5 rows of DataFrame after ETL Step 3 (to check for NaTs) ---")
print(df.tail())
--- Last 5 rows of DataFrame after ETL Step 3 (to check for NaTs) ---
fin_year month state_code State district_code \
302748 2025-2026 July 2 ANDHRA PRADESH 204
302749 2025-2026 July 2 ANDHRA PRADESH 210
302750 2025-2026 July 30 TRIPURA 3004
302751 2025-2026 July 35 UTTARAKHAND 3510
302752 2025-2026 July 35 UTTARAKHAND 3511
District Approved_Labour_Budget \
302748 EAST GODAVARI 2375101.0
302749 CHITTOOR 2940884.0
302750 DHALAI 2705200.0
302751 CHAMPAWAT 285414.0
302752 PITHORAGARH 467827.0
Average_Wage_rate_per_day_per_person \
302748 275.725157
302749 272.479654
302750 225.922068
302751 256.665568
302752 264.807263
Average_days_of_employment_provided_per_Household \
302748 37.0
302749 31.0
302750 27.0
302751 22.0
302752 21.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
302748 1585.0 ... 170297.0
302749 960.0 ... 276499.0
302750 1051.0 ... 99314.0
302751 35.0 ... 40001.0
302752 14.0 ... 76646.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
302748 262576.0 33029.0 7980.073332
302749 496200.0 84708.0 7631.195986
302750 175811.0 11257.0 5290.116600
302751 64271.0 7349.0 404.163570
302752 151313.0 12416.0 482.976670
Women_Persondays percent_of_Category_B_Works \
302748 1625510.0 47.0
302749 1745220.0 70.0
302750 1172761.0 87.0
302751 73670.0 44.0
302752 90627.0 48.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
302748 45.91
302749 63.60
302750 51.61
302751 82.46
302752 79.75
percent_of_NRM_Expenditure \
302748 89.22
302749 83.43
302750 58.11
302751 76.24
302752 55.53
percentage_payments_gererated_within_15_days Date
302748 100.00 NaT
302749 100.00 NaT
302750 100.05 NaT
302751 100.00 NaT
302752 100.01 NaT
[5 rows x 36 columns]
In [11]:
# Check a sample of rows where Date is NaT
print("\n--- Sample of rows where 'Date' is NaT (fin_year, month, Date columns) ---")
print(df[df['Date'].isna()][['fin_year', 'month', 'Date']].head(10))
--- Sample of rows where 'Date' is NaT (fin_year, month, Date columns) ---
fin_year month Date
0 2019-2020 June NaT
1 2019-2020 June NaT
2 2019-2020 June NaT
3 2019-2020 June NaT
4 2019-2020 Aug NaT
5 2019-2020 Aug NaT
6 2019-2020 Aug NaT
7 2019-2020 Aug NaT
8 2019-2020 Aug NaT
9 2019-2020 Aug NaT
In [12]:
# Count final NaNs across all columns
print("\n--- Final Null Counts per Column after ETL Step 3 ---")
print(df.isnull().sum())
--- Final Null Counts per Column after ETL Step 3 --- fin_year 1 month 1 state_code 0 State 1 district_code 0 District 1 Approved_Labour_Budget 1 Average_Wage_rate_per_day_per_person 1 Average_days_of_employment_provided_per_Household 1 Differently_abled_persons_worked 1 Material_and_skilled_Wages 1 Number_of_Completed_Works 1 Number_of_GPs_with_NIL_exp 1 Number_of_Ongoing_Works 1 Persondays_of_Central_Liability_so_far 1 SC_persondays 1 SC_workers_against_active_workers 1 ST_persondays 1 ST_workers_against_active_workers 1 Total_Adm_Expenditure 1 Total_Exp 1 Total_Households_Worked 1 Total_Individuals_Worked 1 Total_No_of_Active_Job_Cards 1 Total_No_of_Active_Workers 1 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 1 Total_No_of_JobCards_issued 1 Total_No_of_Workers 1 Total_No_of_Works_Takenup 1 Wages 1 Women_Persondays 1 percent_of_Category_B_Works 1 percent_of_Expenditure_on_Agriculture_Allied_Works 1 percent_of_NRM_Expenditure 1 percentage_payments_gererated_within_15_days 1 Date 302753 dtype: int64
In [13]:
# Save the fully cleaned data
try:
df.to_csv(FULLY_CLEANED_CSV_FILE, index=False)
print(f"\nFully cleaned data saved to: {FULLY_CLEANED_CSV_FILE}")
except Exception as e:
print(f"Error saving fully cleaned data: {e}")
Error saving fully cleaned data: [Errno 13] Permission denied: 'Datasets\\Cleaned_Preprocessed\\mgnrega_data_fully_cleaned.csv'
In [14]:
print("--- Starting ETL Step 4: Debugging Date Parsing and Final Missing Value Handling ---")
# --- Re-apply essential conversions for consistency if this cell is run independently ---
# These ensure that 'state_code', 'district_code', 'State', 'District' are correct
df['state_code'] = df['state_code'].fillna(-1).astype(int)
df['district_code'] = df['district_code'].fillna(-1).astype(int)
df['State'] = df['State'].astype('category')
df['District'] = df['District'].astype('category')
# Drop 'Remarks' column if it exists and is non-informative
if 'Remarks' in df.columns:
df['Remarks'] = df['Remarks'].replace('Unable to Fatch Data', np.nan)
if df['Remarks'].isnull().all():
df.drop(columns=['Remarks'], inplace=True, errors='ignore')
print("Confirmed 'Remarks' column is entirely NaN and dropped.")
else:
print("'Remarks' column still contains non-NaN values after processing. Retained.")
else:
print("'Remarks' column does not exist in DataFrame. Skipping remarks handling.")
# --- Debugging Date Column Issue ---
# 1. Investigate unique values in the 'month' column directly, ignoring case and stripping spaces
print("\n--- Unique values in original 'month' column (normalized) ---")
normalized_months = df['month'].astype(str).str.strip().str.capitalize().unique()
print(normalized_months)
--- Starting ETL Step 4: Debugging Date Parsing and Final Missing Value Handling --- 'Remarks' column does not exist in DataFrame. Skipping remarks handling. --- Unique values in original 'month' column (normalized) --- ['June' 'Aug' 'April' 'March' 'Feb' 'Jan' 'Nov' 'July' 'May' 'Sep' 'Dec' 'Oct' 'Nan']
In [15]:
# Update month_mapping based on observed unique values, if necessary.
# Based on the previous output, months were like 'June', 'Aug', 'Sep' (already capitalized).
# The current mapping should largely cover these. Let's ensure no hidden chars.
month_mapping = {
'January': 1, 'Jan': 1,
'February': 2, 'Feb': 2,
'March': 3, 'Mar': 3,
'April': 4, 'Apr': 4,
'May': 5,
'June': 6, 'Jun': 6,
'July': 7, 'Jul': 7,
'August': 8, 'Aug': 8,
'September': 9, 'Sep': 9,
'October': 10, 'Oct': 10,
'November': 11, 'Nov': 11,
'December': 12, 'Dec': 12
}
# Apply mapping after stripping whitespace and capitalizing to ensure consistent keys for mapping
df['month_num'] = df['month'].astype(str).str.strip().str.capitalize().map(month_mapping)
# Check for remaining NaNs in month_num after this more robust mapping
print("\n--- Null counts for 'month_num' after improved mapping ---")
print(df['month_num'].isnull().sum())
--- Null counts for 'month_num' after improved mapping --- 1
In [16]:
# If there are still NaNs, let's find the original 'month' values that didn't map
unmapped_months = df[df['month_num'].isna()]['month'].astype(str).str.strip().str.capitalize().unique()
if len(unmapped_months) > 1 or (len(unmapped_months) == 1 and unmapped_months[0] != 'Nan'): # 'Nan' is the string representation of numpy.nan
print(f"\n--- Unmapped month values found: {unmapped_months} ---")
# This indicates some unexpected string in 'month'.
# For now, we'll keep the NaNs from these unmapped months, as they are genuinely unparseable.
# If the unmapped month is just 'Nan', it means only the original NaN month caused issues.
# Otherwise, it means there are actual strings like 'N/A' or '?' that need adding to month_mapping or replacement with NaN.
# In this dataset, we found `NaN` as the only unmapped value, implying the map itself is fine,
# and the single original `month` NaN is the root cause.
# Extract the start year from 'fin_year'
df['start_year'] = df['fin_year'].apply(lambda x: int(str(x).split('-')[0]) if pd.notna(x) and isinstance(x, str) and '-' in x else np.nan)
# Function to determine the calendar year, now with more robust NaN handling within `pd.to_datetime`
def get_calendar_year(row):
# If either start_year or month_num is NaN, the calendar year cannot be determined.
if pd.isna(row['start_year']) or pd.isna(row['month_num']):
return np.nan
current_start_year = int(row['start_year'])
current_month_num = int(row['month_num'])
if current_month_num >= 4: # April (4) to December (12) of the financial year's first calendar year
return current_start_year
else: # January (1) to March (3) of the financial year's second calendar year
return current_start_year + 1
df['calendar_year'] = df.apply(get_calendar_year, axis=1)
# Create the 'Date' column
# We are already coercing errors to NaT, so if `calendar_year` or `month_num` are NaN,
# the resulting date will be NaT. This is expected.
df['Date'] = pd.to_datetime(
df['calendar_year'].astype(str) + '-' +
df['month_num'].astype(str) + '-01',
format='%Y-%m-%d',
errors='coerce' # Keep coerce errors, as we want NaT if year/month combination is invalid
)
# Drop intermediate date columns
df.drop(columns=['month_num', 'start_year', 'calendar_year'], inplace=True, errors='ignore')
# --- Handle remaining small null counts ---
# From previous output, most columns have 1 NaN. This is likely the single row that had
# NaN in 'fin_year' and 'month'. Let's identify and drop that row.
initial_rows = len(df)
df.dropna(subset=['fin_year', 'month'], inplace=True) # Dropping rows where fin_year or month are NaN
print(f"\nDropped {initial_rows - len(df)} rows due to NaN in 'fin_year' or 'month'.")
# Final check for NaNs across all columns after dropping the problematic row
print("\n--- Final Null Counts per Column after ETL Step 4 ---")
print(df.isnull().sum())
Dropped 1 rows due to NaN in 'fin_year' or 'month'. --- Final Null Counts per Column after ETL Step 4 --- fin_year 0 month 0 state_code 0 State 0 district_code 0 District 0 Approved_Labour_Budget 0 Average_Wage_rate_per_day_per_person 0 Average_days_of_employment_provided_per_Household 0 Differently_abled_persons_worked 0 Material_and_skilled_Wages 0 Number_of_Completed_Works 0 Number_of_GPs_with_NIL_exp 0 Number_of_Ongoing_Works 0 Persondays_of_Central_Liability_so_far 0 SC_persondays 0 SC_workers_against_active_workers 0 ST_persondays 0 ST_workers_against_active_workers 0 Total_Adm_Expenditure 0 Total_Exp 0 Total_Households_Worked 0 Total_Individuals_Worked 0 Total_No_of_Active_Job_Cards 0 Total_No_of_Active_Workers 0 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 0 Total_No_of_JobCards_issued 0 Total_No_of_Workers 0 Total_No_of_Works_Takenup 0 Wages 0 Women_Persondays 0 percent_of_Category_B_Works 0 percent_of_Expenditure_on_Agriculture_Allied_Works 0 percent_of_NRM_Expenditure 0 percentage_payments_gererated_within_15_days 0 Date 302752 dtype: int64
In [17]:
# --- Final check after ETL Step 4 ---
print("\n--- DataFrame Info after ETL Step 4 ---")
df.info()
--- DataFrame Info after ETL Step 4 --- <class 'pandas.core.frame.DataFrame'> Index: 302752 entries, 0 to 302752 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fin_year 302752 non-null object 1 month 302752 non-null object 2 state_code 302752 non-null int64 3 State 302752 non-null category 4 district_code 302752 non-null int64 5 District 302752 non-null category 6 Approved_Labour_Budget 302752 non-null float64 7 Average_Wage_rate_per_day_per_person 302752 non-null float64 8 Average_days_of_employment_provided_per_Household 302752 non-null float64 9 Differently_abled_persons_worked 302752 non-null float64 10 Material_and_skilled_Wages 302752 non-null float64 11 Number_of_Completed_Works 302752 non-null float64 12 Number_of_GPs_with_NIL_exp 302752 non-null float64 13 Number_of_Ongoing_Works 302752 non-null float64 14 Persondays_of_Central_Liability_so_far 302752 non-null float64 15 SC_persondays 302752 non-null float64 16 SC_workers_against_active_workers 302752 non-null float64 17 ST_persondays 302752 non-null float64 18 ST_workers_against_active_workers 302752 non-null float64 19 Total_Adm_Expenditure 302752 non-null float64 20 Total_Exp 302752 non-null float64 21 Total_Households_Worked 302752 non-null float64 22 Total_Individuals_Worked 302752 non-null float64 23 Total_No_of_Active_Job_Cards 302752 non-null float64 24 Total_No_of_Active_Workers 302752 non-null float64 25 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 302752 non-null float64 26 Total_No_of_JobCards_issued 302752 non-null float64 27 Total_No_of_Workers 302752 non-null float64 28 Total_No_of_Works_Takenup 302752 non-null float64 29 Wages 302752 non-null float64 30 Women_Persondays 302752 non-null float64 31 percent_of_Category_B_Works 302752 non-null float64 32 percent_of_Expenditure_on_Agriculture_Allied_Works 302752 non-null float64 33 percent_of_NRM_Expenditure 302752 non-null float64 34 percentage_payments_gererated_within_15_days 302752 non-null float64 35 Date 0 non-null datetime64[ns] dtypes: category(2), datetime64[ns](1), float64(29), int64(2), object(2) memory usage: 81.7+ MB
In [18]:
print("\n--- First 5 rows of DataFrame after ETL Step 4 ---")
print(df.head())
--- First 5 rows of DataFrame after ETL Step 4 ---
fin_year month state_code State district_code \
0 2019-2020 June 35 UTTARAKHAND 3506
1 2019-2020 June 35 UTTARAKHAND 3508
2 2019-2020 June 35 UTTARAKHAND 3512
3 2019-2020 June 37 LADAKH 3707
4 2019-2020 Aug 14 JAMMU AND KASHMIR 1411
District Approved_Labour_Budget Average_Wage_rate_per_day_per_person \
0 RUDRA PRAYAG 323294.0 181.735337
1 NAINITAL 252505.0 177.663900
2 BAGESHWAR 241752.0 171.565915
3 LEH (LADAKH) 0.0 85814.193438
4 POONCH 640000.0 350.059928
Average_days_of_employment_provided_per_Household \
0 27.0
1 29.0
2 25.0
3 10.0
4 37.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
0 8.0 ... 49606.0
1 21.0 ... 58673.0
2 57.0 ... 48309.0
3 0.0 ... 17284.0
4 76.0 ... 101984.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
0 89741.0 4422.0 352.164918
1 110623.0 5760.0 546.629180
2 92833.0 2528.0 302.280270
3 36738.0 1328.0 601.557496
4 207375.0 9755.0 2766.306570
Women_Persondays percent_of_Category_B_Works \
0 114158.0 45.0
1 137899.0 54.0
2 95237.0 33.0
3 476.0 1.0
4 279921.0 5.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
0 60.57
1 82.66
2 77.87
3 21.38
4 19.61
percent_of_NRM_Expenditure percentage_payments_gererated_within_15_days \
0 52.24 100.00
1 67.05 100.02
2 70.21 100.03
3 27.70 0.00
4 48.57 1446.75
Date
0 NaT
1 NaT
2 NaT
3 NaT
4 NaT
[5 rows x 36 columns]
In [19]:
print("\n--- Last 5 rows of DataFrame after ETL Step 4 ---")
print(df.tail())
--- Last 5 rows of DataFrame after ETL Step 4 ---
fin_year month state_code State district_code \
302748 2025-2026 July 2 ANDHRA PRADESH 204
302749 2025-2026 July 2 ANDHRA PRADESH 210
302750 2025-2026 July 30 TRIPURA 3004
302751 2025-2026 July 35 UTTARAKHAND 3510
302752 2025-2026 July 35 UTTARAKHAND 3511
District Approved_Labour_Budget \
302748 EAST GODAVARI 2375101.0
302749 CHITTOOR 2940884.0
302750 DHALAI 2705200.0
302751 CHAMPAWAT 285414.0
302752 PITHORAGARH 467827.0
Average_Wage_rate_per_day_per_person \
302748 275.725157
302749 272.479654
302750 225.922068
302751 256.665568
302752 264.807263
Average_days_of_employment_provided_per_Household \
302748 37.0
302749 31.0
302750 27.0
302751 22.0
302752 21.0
Differently_abled_persons_worked ... Total_No_of_JobCards_issued \
302748 1585.0 ... 170297.0
302749 960.0 ... 276499.0
302750 1051.0 ... 99314.0
302751 35.0 ... 40001.0
302752 14.0 ... 76646.0
Total_No_of_Workers Total_No_of_Works_Takenup Wages \
302748 262576.0 33029.0 7980.073332
302749 496200.0 84708.0 7631.195986
302750 175811.0 11257.0 5290.116600
302751 64271.0 7349.0 404.163570
302752 151313.0 12416.0 482.976670
Women_Persondays percent_of_Category_B_Works \
302748 1625510.0 47.0
302749 1745220.0 70.0
302750 1172761.0 87.0
302751 73670.0 44.0
302752 90627.0 48.0
percent_of_Expenditure_on_Agriculture_Allied_Works \
302748 45.91
302749 63.60
302750 51.61
302751 82.46
302752 79.75
percent_of_NRM_Expenditure \
302748 89.22
302749 83.43
302750 58.11
302751 76.24
302752 55.53
percentage_payments_gererated_within_15_days Date
302748 100.00 NaT
302749 100.00 NaT
302750 100.05 NaT
302751 100.00 NaT
302752 100.01 NaT
[5 rows x 36 columns]
In [20]:
# Check a sample of rows where Date is NaT (should be zero now)
print("\n--- Sample of rows where 'Date' is NaT (fin_year, month, Date columns) ---")
print(df[df['Date'].isna()][['fin_year', 'month', 'Date']].head(10)) # This should ideally be empty
--- Sample of rows where 'Date' is NaT (fin_year, month, Date columns) ---
fin_year month Date
0 2019-2020 June NaT
1 2019-2020 June NaT
2 2019-2020 June NaT
3 2019-2020 June NaT
4 2019-2020 Aug NaT
5 2019-2020 Aug NaT
6 2019-2020 Aug NaT
7 2019-2020 Aug NaT
8 2019-2020 Aug NaT
9 2019-2020 Aug NaT
In [21]:
# Save the fully cleaned data
try:
df.to_csv(FULLY_CLEANED_CSV_FILE, index=False)
print(f"\nFully cleaned data saved to: {FULLY_CLEANED_CSV_FILE}")
except Exception as e:
print(f"Error saving fully cleaned data: {e}")
Error saving fully cleaned data: [Errno 13] Permission denied: 'Datasets\\Cleaned_Preprocessed\\mgnrega_data_fully_cleaned.csv'
In [22]:
print("\n--- Starting EDA Step 1: Descriptive Statistics for Numerical Features ---")
# Select only numerical columns for descriptive statistics
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Generate comprehensive descriptive statistics
desc_stats = df[numerical_cols].describe().T
print("\n--- Comprehensive Descriptive Statistics for Numerical Columns ---")
print(desc_stats)
--- Starting EDA Step 1: Descriptive Statistics for Numerical Features ---
--- Comprehensive Descriptive Statistics for Numerical Columns ---
count mean \
state_code 302752.0 2.025146e+01
district_code 302752.0 2.043824e+03
Approved_Labour_Budget 302752.0 2.075257e+06
Average_Wage_rate_per_day_per_person 302752.0 7.855366e+03
Average_days_of_employment_provided_per_Household 302752.0 2.820472e+01
Differently_abled_persons_worked 302752.0 4.547516e+02
Material_and_skilled_Wages 302752.0 1.605630e+03
Number_of_Completed_Works 302752.0 5.093862e+03
Number_of_GPs_with_NIL_exp 302752.0 4.089783e+01
Number_of_Ongoing_Works 302752.0 1.660017e+04
Persondays_of_Central_Liability_so_far 302752.0 1.892556e+06
SC_persondays 302752.0 3.566862e+05
SC_workers_against_active_workers 302752.0 4.253325e+04
ST_persondays 302752.0 3.494912e+05
ST_workers_against_active_workers 302752.0 3.779081e+04
Total_Adm_Expenditure 302752.0 2.103804e+02
Total_Exp 302752.0 6.431948e+03
Total_Households_Worked 302752.0 5.327019e+04
Total_Individuals_Worked 302752.0 7.267501e+04
Total_No_of_Active_Job_Cards 302752.0 1.207948e+05
Total_No_of_Active_Workers 302752.0 2.172280e+05
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 302752.0 1.158992e+03
Total_No_of_JobCards_issued 302752.0 2.052138e+05
Total_No_of_Workers 302752.0 3.591445e+05
Total_No_of_Works_Takenup 302752.0 2.169403e+04
Wages 302752.0 4.615937e+03
Women_Persondays 302752.0 1.081912e+06
percent_of_Category_B_Works 302752.0 5.350844e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 302752.0 4.547593e+01
percent_of_NRM_Expenditure 302752.0 3.568247e+01
percentage_payments_gererated_within_15_days 302752.0 1.310342e+03
std min \
state_code 1.053332e+01 1.0
district_code 1.056419e+03 101.0
Approved_Labour_Budget 2.509155e+06 0.0
Average_Wage_rate_per_day_per_person 2.757737e+05 0.0
Average_days_of_employment_provided_per_Household 1.466575e+01 0.0
Differently_abled_persons_worked 1.014414e+03 0.0
Material_and_skilled_Wages 2.801585e+03 0.0
Number_of_Completed_Works 8.215692e+03 0.0
Number_of_GPs_with_NIL_exp 1.031464e+02 0.0
Number_of_Ongoing_Works 1.748567e+04 0.0
Persondays_of_Central_Liability_so_far 2.659581e+06 0.0
SC_persondays 6.361468e+05 0.0
SC_workers_against_active_workers 5.268318e+04 0.0
ST_persondays 9.248198e+05 0.0
ST_workers_against_active_workers 6.749146e+04 0.0
Total_Adm_Expenditure 3.033630e+02 0.0
Total_Exp 8.855501e+03 0.0
Total_Households_Worked 6.071137e+04 0.0
Total_Individuals_Worked 8.655784e+04 0.0
Total_No_of_Active_Job_Cards 9.988319e+04 0.0
Total_No_of_Active_Workers 1.888368e+05 0.0
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 4.112897e+03 0.0
Total_No_of_JobCards_issued 1.726795e+05 2548.0
Total_No_of_Workers 3.186762e+05 2734.0
Total_No_of_Works_Takenup 2.225143e+04 2.0
Wages 6.291311e+03 0.0
Women_Persondays 1.723049e+06 0.0
percent_of_Category_B_Works 2.755359e+01 0.0
percent_of_Expenditure_on_Agriculture_Allied_Works 2.393623e+01 0.0
percent_of_NRM_Expenditure 2.797142e+01 0.0
percentage_payments_gererated_within_15_days 2.278485e+05 0.0
25% \
state_code 12.000000
district_code 1218.000000
Approved_Labour_Budget 448161.000000
Average_Wage_rate_per_day_per_person 223.402093
Average_days_of_employment_provided_per_Household 17.000000
Differently_abled_persons_worked 23.000000
Material_and_skilled_Wages 17.810320
Number_of_Completed_Works 499.000000
Number_of_GPs_with_NIL_exp 0.000000
Number_of_Ongoing_Works 4795.000000
Persondays_of_Central_Liability_so_far 230420.500000
SC_persondays 11467.000000
SC_workers_against_active_workers 7484.000000
ST_persondays 1938.000000
ST_workers_against_active_workers 1084.000000
Total_Adm_Expenditure 6.813650
Total_Exp 848.671839
Total_Households_Worked 10951.500000
Total_Individuals_Worked 13851.750000
Total_No_of_Active_Job_Cards 46663.000000
Total_No_of_Active_Workers 84096.500000
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 0.000000
Total_No_of_JobCards_issued 82806.250000
Total_No_of_Workers 138738.000000
Total_No_of_Works_Takenup 6496.000000
Wages 624.374586
Women_Persondays 112228.750000
percent_of_Category_B_Works 33.000000
percent_of_Expenditure_on_Agriculture_Allied_Works 27.400000
percent_of_NRM_Expenditure 10.590000
percentage_payments_gererated_within_15_days 99.930000
50% \
state_code 2.000000e+01
district_code 2.012000e+03
Approved_Labour_Budget 1.254692e+06
Average_Wage_rate_per_day_per_person 2.508062e+02
Average_days_of_employment_provided_per_Household 2.800000e+01
Differently_abled_persons_worked 1.120000e+02
Material_and_skilled_Wages 4.532738e+02
Number_of_Completed_Works 2.192000e+03
Number_of_GPs_with_NIL_exp 3.000000e+00
Number_of_Ongoing_Works 1.178500e+04
Persondays_of_Central_Liability_so_far 9.219625e+05
SC_persondays 1.094760e+05
SC_workers_against_active_workers 2.855500e+04
ST_persondays 4.562900e+04
ST_workers_against_active_workers 1.110500e+04
Total_Adm_Expenditure 9.095982e+01
Total_Exp 3.187959e+03
Total_Households_Worked 3.420400e+04
Total_Individuals_Worked 4.379350e+04
Total_No_of_Active_Job_Cards 9.991100e+04
Total_No_of_Active_Workers 1.742250e+05
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 3.300000e+01
Total_No_of_JobCards_issued 1.630680e+05
Total_No_of_Workers 2.835330e+05
Total_No_of_Works_Takenup 1.591900e+04
Wages 2.329798e+03
Women_Persondays 4.581500e+05
percent_of_Category_B_Works 5.800000e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 4.485000e+01
percent_of_NRM_Expenditure 2.979000e+01
percentage_payments_gererated_within_15_days 1.000000e+02
75% max
state_code 3.100000e+01 3.700000e+01
district_code 3.110000e+03 3.708000e+03
Approved_Labour_Budget 2.792038e+06 5.049829e+07
Average_Wage_rate_per_day_per_person 2.927669e+02 4.439217e+07
Average_days_of_employment_provided_per_Household 3.800000e+01 1.050000e+02
Differently_abled_persons_worked 4.040000e+02 2.647000e+04
Material_and_skilled_Wages 2.057956e+03 6.307132e+04
Number_of_Completed_Works 6.415000e+03 2.011390e+05
Number_of_GPs_with_NIL_exp 2.800000e+01 1.715000e+03
Number_of_Ongoing_Works 2.322125e+04 3.407190e+05
Persondays_of_Central_Liability_so_far 2.505545e+06 5.047109e+07
SC_persondays 4.384945e+05 1.646966e+07
SC_workers_against_active_workers 6.010350e+04 7.184840e+05
ST_persondays 3.157938e+05 1.919524e+07
ST_workers_against_active_workers 4.609525e+04 7.592320e+05
Total_Adm_Expenditure 2.814134e+02 3.529930e+03
Total_Exp 8.468628e+03 1.670081e+05
Total_Households_Worked 7.372100e+04 8.305860e+05
Total_Individuals_Worked 9.978025e+04 1.222688e+06
Total_No_of_Active_Job_Cards 1.625052e+05 1.063539e+06
Total_No_of_Active_Workers 2.976880e+05 2.124537e+06
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 5.050000e+02 1.690930e+05
Total_No_of_JobCards_issued 2.796300e+05 1.749620e+06
Total_No_of_Workers 4.906972e+05 3.292845e+06
Total_No_of_Works_Takenup 3.039200e+04 4.866350e+05
Wages 6.064413e+03 1.027751e+05
Women_Persondays 1.312826e+06 2.635214e+07
percent_of_Category_B_Works 7.600000e+01 9.900000e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 6.288000e+01 1.000000e+02
percent_of_NRM_Expenditure 5.871000e+01 1.000000e+02
percentage_payments_gererated_within_15_days 1.005800e+02 8.403151e+07
In [23]:
# Beyond standard statistics: check skewness and kurtosis to understand distribution shape
# Skewness: measures asymmetry of the distribution (positive = right-skewed, negative = left-skewed)
# Kurtosis: measures "tailedness" of the distribution (positive = heavy tails, negative = light tails)
skewness = df[numerical_cols].skew()
kurtosis = df[numerical_cols].kurt()
# Combine into a single DataFrame for easier viewing
distribution_stats = pd.DataFrame({
'skew': skewness,
'kurtosis': kurtosis
})
print("\n--- Skewness and Kurtosis for Numerical Columns ---")
print(distribution_stats)
--- Skewness and Kurtosis for Numerical Columns ---
skew kurtosis
state_code -0.218060 -1.165571
district_code -0.212957 -1.168488
Approved_Labour_Budget 3.011025 15.445552
Average_Wage_rate_per_day_per_person 71.856279 6842.353773
Average_days_of_employment_provided_per_Household 0.347391 0.316804
Differently_abled_persons_worked 5.868201 57.523738
Material_and_skilled_Wages 4.045168 30.108224
Number_of_Completed_Works 4.711448 42.322876
Number_of_GPs_with_NIL_exp 4.763743 31.107063
Number_of_Ongoing_Works 3.050894 21.057630
Persondays_of_Central_Liability_so_far 3.177432 16.640098
SC_persondays 4.564411 40.222910
SC_workers_against_active_workers 3.766103 26.161669
ST_persondays 7.627024 83.260574
ST_workers_against_active_workers 4.091767 23.739848
Total_Adm_Expenditure 2.597227 9.552012
Total_Exp 3.011238 14.577952
Total_Households_Worked 2.360042 8.664987
Total_Individuals_Worked 2.474435 9.418968
Total_No_of_Active_Job_Cards 1.789309 5.506648
Total_No_of_Active_Workers 2.201306 8.762560
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 9.762769 170.542505
Total_No_of_JobCards_issued 1.822272 5.407297
Total_No_of_Workers 2.079651 6.832541
Total_No_of_Works_Takenup 3.114751 23.748147
Wages 2.917746 12.943180
Women_Persondays 3.665327 19.392907
percent_of_Category_B_Works -0.392741 -0.883941
percent_of_Expenditure_on_Agriculture_Allied_Works 0.095354 -0.693425
percent_of_NRM_Expenditure 0.447483 -1.010623
percentage_payments_gererated_within_15_days 336.330202 122001.844511
In [24]:
# Further inspect potential issues like zero values or extremely large values
# that might indicate data entry errors or outliers that need special attention.
# Count zero values for columns where zero might be abnormal (e.g., wages, persondays)
zero_counts = {}
for col in numerical_cols:
if df[col].dtype in ['int64', 'float64']:
num_zeros = (df[col] == 0).sum()
if num_zeros > 0:
zero_counts[col] = num_zeros
if zero_counts:
print("\n--- Count of Zero Values in Numerical Columns ---")
for col, count in zero_counts.items():
print(f"'{col}': {count} zeros ({count/len(df)*100:.2f}%)")
else:
print("\nNo significant zero values found in numerical columns.")
# Identify potential outliers using IQR method for a few key columns (example)
# This is a preliminary check. Outlier handling comes later.
key_cols_for_outlier_check = [
'Approved_Labour_Budget', 'Total_Exp', 'Wages',
'Total_No_of_Workers', 'Average_Wage_rate_per_day_per_person'
]
print("\n--- Potential Outlier Check (using IQR method, top/bottom 5 values) ---")
for col in key_cols_for_outlier_check:
if col in df.columns:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_low = df[df[col] < lower_bound][col].sort_values(ascending=True)
outliers_high = df[df[col] > upper_bound][col].sort_values(ascending=False)
print(f"\n--- Column: {col} ---")
if not outliers_low.empty:
print(f" Lower outliers (below {lower_bound:.2f}):\n{outliers_low.head()}")
if not outliers_high.empty:
print(f" Upper outliers (above {upper_bound:.2f}):\n{outliers_high.head()}")
if outliers_low.empty and outliers_high.empty:
print(f" No clear outliers detected by IQR for {col}.")
else:
print(f"Column '{col}' not found for outlier check.")
--- Count of Zero Values in Numerical Columns --- 'Approved_Labour_Budget': 4141 zeros (1.37%) 'Average_Wage_rate_per_day_per_person': 10128 zeros (3.35%) 'Average_days_of_employment_provided_per_Household': 9523 zeros (3.15%) 'Differently_abled_persons_worked': 22738 zeros (7.51%) 'Material_and_skilled_Wages': 48383 zeros (15.98%) 'Number_of_Completed_Works': 7321 zeros (2.42%) 'Number_of_GPs_with_NIL_exp': 109856 zeros (36.29%) 'Number_of_Ongoing_Works': 3 zeros (0.00%) 'Persondays_of_Central_Liability_so_far': 9523 zeros (3.15%) 'SC_persondays': 26824 zeros (8.86%) 'SC_workers_against_active_workers': 13664 zeros (4.51%) 'ST_persondays': 25923 zeros (8.56%) 'ST_workers_against_active_workers': 12135 zeros (4.01%) 'Total_Adm_Expenditure': 59910 zeros (19.79%) 'Total_Exp': 1185 zeros (0.39%) 'Total_Households_Worked': 9523 zeros (3.15%) 'Total_Individuals_Worked': 9523 zeros (3.15%) 'Total_No_of_Active_Job_Cards': 60 zeros (0.02%) 'Total_No_of_Active_Workers': 60 zeros (0.02%) 'Total_No_of_HHs_completed_100_Days_of_Wage_Employment': 96008 zeros (31.71%) 'Wages': 4834 zeros (1.60%) 'Women_Persondays': 9740 zeros (3.22%) 'percent_of_Category_B_Works': 14601 zeros (4.82%) 'percent_of_Expenditure_on_Agriculture_Allied_Works': 10350 zeros (3.42%) 'percent_of_NRM_Expenditure': 14528 zeros (4.80%) 'percentage_payments_gererated_within_15_days': 12390 zeros (4.09%) --- Potential Outlier Check (using IQR method, top/bottom 5 values) --- --- Column: Approved_Labour_Budget --- Upper outliers (above 6307852.25): 183283 50498292.0 249508 45673162.0 182200 44049286.0 183281 40381549.0 189565 39291401.0 Name: Approved_Labour_Budget, dtype: float64 --- Column: Total_Exp --- Upper outliers (above 19898.56): 187438 167008.099515 186158 156955.066792 64564 150456.560210 84340 142394.341368 188138 133027.931497 Name: Total_Exp, dtype: float64 --- Column: Wages --- Upper outliers (above 14224.47): 183283 102775.107117 187438 102475.541585 249508 95845.852313 186158 92595.637800 182200 89041.055471 Name: Wages, dtype: float64 --- Column: Total_No_of_Workers --- Upper outliers (above 1018636.12): 188457 3292845.0 86973 3292767.0 85614 3292578.0 185100 3292192.0 134357 3291349.0 Name: Total_No_of_Workers, dtype: float64 --- Column: Average_Wage_rate_per_day_per_person --- Lower outliers (below 119.35): 46636 0.0 230648 0.0 230649 0.0 230651 0.0 230654 0.0 Name: Average_Wage_rate_per_day_per_person, dtype: float64 Upper outliers (above 396.81): 26605 44392173.06 291097 34090333.06 112921 33569493.85 129233 30510851.65 128729 22121326.60 Name: Average_Wage_rate_per_day_per_person, dtype: float64
In [25]:
print("\n--- Starting ETL Step 5: Outlier Treatment for Critical Numerical Columns ---")
# --- Outlier Treatment for Average_Wage_rate_per_day_per_person ---
wage_col = 'Average_Wage_rate_per_day_per_person'
# Calculate the 99.9th percentile for capping upper outliers
# Using a high percentile helps preserve high but legitimate values
upper_cap_wage = df[wage_col].quantile(0.999)
# A more aggressive cap if the 99.9th percentile is still very high (e.g., above 5000-10000)
# Let's say max reasonable daily wage is 5000 for this context
domain_max_wage = 5000.0
final_upper_cap_wage = min(upper_cap_wage, domain_max_wage)
# Cap values above the final_upper_cap_wage
df[wage_col] = np.where(df[wage_col] > final_upper_cap_wage, final_upper_cap_wage, df[wage_col])
print(f"Capped upper outliers in '{wage_col}' at {final_upper_cap_wage:.2f}.")
--- Starting ETL Step 5: Outlier Treatment for Critical Numerical Columns --- Capped upper outliers in 'Average_Wage_rate_per_day_per_person' at 5000.00.
In [26]:
# Replace zero values (and any other unrealistic low values, if needed) with NaN for imputation
# Based on IQR output, 0.0 is present as a lower outlier and is problematic for an "average wage"
df[wage_col] = df[wage_col].replace(0.0, np.nan)
# Impute NaN values with the *median* of the cleaned wage column
# Median is preferred over mean due to potential remaining skewness and robustness to outliers
median_wage = df[wage_col].median()
df[wage_col] = df[wage_col].fillna(median_wage)
print(f"Replaced 0.0 values (and NaNs) in '{wage_col}' with median: {median_wage:.2f}.")
Replaced 0.0 values (and NaNs) in 'Average_Wage_rate_per_day_per_person' with median: 252.49.
In [27]:
# --- Outlier Treatment for percentage_payments_gererated_within_15_days ---
payments_col = 'percentage_payments_gererated_within_15_days'
# Clip values to be strictly within [0, 100] as it's a percentage
df[payments_col] = np.clip(df[payments_col], 0, 100)
print(f"Clipped '{payments_col}' to be within [0, 100].")
Clipped 'percentage_payments_gererated_within_15_days' to be within [0, 100].
In [28]:
# --- Re-check descriptive statistics for the treated columns to confirm impact ---
print("\n--- Descriptive Statistics for Treated Columns (After Outlier Treatment) ---")
treated_cols = [wage_col, payments_col]
print(df[treated_cols].describe().T)
--- Descriptive Statistics for Treated Columns (After Outlier Treatment) ---
count mean \
Average_Wage_rate_per_day_per_person 302752.0 357.064330
percentage_payments_gererated_within_15_days 302752.0 95.842601
std min 25% \
Average_Wage_rate_per_day_per_person 606.383738 0.00365 230.892736
percentage_payments_gererated_within_15_days 19.799117 0.00000 99.930000
50% 75% max
Average_Wage_rate_per_day_per_person 252.492009 292.766889 5000.0
percentage_payments_gererated_within_15_days 100.000000 100.000000 100.0
In [29]:
print("\n--- Skewness and Kurtosis for Treated Columns (After Outlier Treatment) ---")
print(df[treated_cols].skew().rename('skew'))
print(df[treated_cols].kurt().rename('kurtosis'))
--- Skewness and Kurtosis for Treated Columns (After Outlier Treatment) --- Average_Wage_rate_per_day_per_person 6.857600 percentage_payments_gererated_within_15_days -4.633732 Name: skew, dtype: float64 Average_Wage_rate_per_day_per_person 47.746503 percentage_payments_gererated_within_15_days 19.473925 Name: kurtosis, dtype: float64
In [30]:
# --- Visualize distributions of treated columns to confirm improvements ---
plt.figure(figsize=(14, 6))
# Histogram for Average_Wage_rate_per_day_per_person
plt.subplot(1, 2, 1)
sns.histplot(df[wage_col], kde=True, bins=50, color='skyblue')
plt.title(f'Distribution of {wage_col} (Treated)')
plt.xlabel('Average Daily Wage Rate (₹)')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Histogram for percentage_payments_gererated_within_15_days
plt.subplot(1, 2, 2)
sns.histplot(df[payments_col], kde=True, bins=50, color='lightcoral')
plt.title(f'Distribution of {payments_col} (Treated)')
plt.xlabel('Percentage of Payments within 15 Days (%)')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
In [31]:
# Box Plot for Average_Wage_rate_per_day_per_person
plt.figure(figsize=(10, 5))
sns.boxplot(x=df[wage_col], color='skyblue')
plt.title(f'Box Plot of {wage_col} (Treated)')
plt.xlabel('Average Daily Wage Rate (₹)')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
In [32]:
# Box Plot for percentage_payments_gererated_within_15_days
plt.figure(figsize=(10, 5))
sns.boxplot(x=df[payments_col], color='lightcoral')
plt.title(f'Box Plot of {payments_col} (Treated)')
plt.xlabel('Percentage of Payments within 15 Days (%)')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
# Save the DataFrame after outlier treatment
try:
df.to_csv(FULLY_CLEANED_CSV_FILE, index=False)
print(f"\nDataFrame saved after ETL Step 5 to: {FULLY_CLEANED_CSV_FILE}")
except Exception as e:
print(f"Error saving DataFrame after ETL Step 5: {e}")
Error saving DataFrame after ETL Step 5: [Errno 13] Permission denied: 'Datasets\\Cleaned_Preprocessed\\mgnrega_data_fully_cleaned.csv'
In [33]:
# Load the fully cleaned dataset with robust data handling
try:
df = pd.read_csv(FULLY_CLEANED_CSV_FILE)
# --- Robust Numerical Column Re-conversion AFTER CSV Load ---
# Ensure all numerical columns are correctly typed and handle inf/NaN.
numerical_cols_to_reconvert = [
'Approved_Labour_Budget', 'Average_Wage_rate_per_day_per_person',
'Average_days_of_employment_provided_per_Household', 'Differently_abled_persons_worked',
'Material_and_skilled_Wages', 'Number_of_Completed_Works',
'Number_of_GPs_with_NIL_exp', 'Number_of_Ongoing_Works',
'Persondays_of_Central_Liability_so_far', 'SC_persondays',
'SC_workers_against_active_workers', 'ST_persondays',
'ST_workers_against_active_workers', 'Total_Adm_Expenditure', 'Total_Exp',
'Total_Households_Worked', 'Total_Individuals_Worked',
'Total_No_of_Active_Job_Cards', 'Total_No_of_Active_Workers',
'Total_No_of_HHs_completed_100_Days_of_Wage_Employment',
'Total_No_of_JobCards_issued', 'Total_No_of_Workers',
'Total_No_of_Works_Takenup', 'Wages', 'Women_Persondays',
'percent_of_Category_B_Works', 'percent_of_Expenditure_on_Agriculture_Allied_Works',
'percent_of_NRM_Expenditure', 'percentage_payments_gererated_within_15_days'
]
for col in numerical_cols_to_reconvert:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) # Handle inf/NaN with 0
df['state_code'] = df['state_code'].fillna(-1).astype(int)
df['district_code'] = df['district_code'].fillna(-1).astype(int)
# --- NO GLOBAL DATE COLUMN CREATION (as discussed, will derive as needed) ---
df['State'] = df['State'].astype('category')
df['District'] = df['District'].astype('category')
print(f"Loaded fully cleaned data from: {FULLY_CLEANED_CSV_FILE}")
print(f"DataFrame shape after robust numerical handling: {df.shape}")
# --- Filter out 2024-2025 and 2025-2026 for consistent historical analysis (NEW) ---
print("\n--- Filtering Data for Consistent Historical Analysis (Excluding 2024-2025 and 2025-2026) ---")
years_to_exclude = ['2024-2025', '2025-2026']
df = df[~df['fin_year'].isin(years_to_exclude)].copy() # Filter the main DataFrame
print(f"DataFrame shape after filtering {years_to_exclude}: {df.shape}")
print(f"Financial years remaining in data: {df['fin_year'].unique()}")
# --- Data Sorting ---
print("\n--- Sorting DataFrame by Financial Year, Month (Financial Order), State Code, and District Code ---")
# 1. Define month order for Indian Financial Year (April to March)
financial_month_order = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
# Create a more robust month_to_num mapping to handle both full and short month names
# This dictionary is mainly for converting to numerical keys for comparison/sorting if needed,
# but the primary sorting will leverage CategoricalDtype
month_to_num_for_plotting = { # Use a different name to avoid confusion with sorting key
'April': 0, 'May': 1, 'June': 2, 'July': 3, 'August': 4, 'September': 5,
'October': 6, 'November': 7, 'December': 8, 'January': 9, 'February': 10, 'March': 11,
'Apr': 0, 'May': 1, 'Jun': 2, 'Jul': 3, 'Aug': 4, 'Sep': 5,
'Oct': 6, 'Nov': 7, 'Dec': 8, 'Jan': 9, 'Feb': 10, 'Mar': 11
}
# 2. Create numerical sorting key for fin_year
df['fin_year_sort_key'] = df['fin_year'].astype(str).str.split('-').str[0].astype(int)
# 3. Process 'month' column and convert to an ORDERED CategoricalDtype
df['month_processed'] = df['month'].astype(str).str.strip().str.title()
df['month_full_name_for_sort'] = df['month_processed'].apply(
lambda x: next((m_full for m_full in financial_month_order if m_full.startswith(x)), x)
)
df['month_ordered'] = pd.Categorical(df['month_full_name_for_sort'], categories=financial_month_order, ordered=True)
# 4. Sort the DataFrame using the custom ordered 'month_ordered'
df.sort_values(by=['fin_year_sort_key', 'month_ordered', 'state_code', 'district_code'], inplace=True)
# 5. Drop temporary sorting keys
df.drop(columns=['fin_year_sort_key', 'month_processed', 'month_full_name_for_sort', 'month_ordered'], inplace=True, errors='ignore')
print("DataFrame sorted successfully.")
print(df[['fin_year', 'month', 'state_code', 'State', 'district_code', 'District']].head(20))
# --- Save the fully cleaned, sorted, and FILTERED DataFrame (UPDATED PATH) ---
df.to_csv(FINAL_FILTERED_SORTED_CSV_FILE, index=False)
print(f"\nFiltered data for analysis saved to: {FINAL_FILTERED_SORTED_CSV_FILE}")
except FileNotFoundError:
print(f"Error: The file '{FULLY_CLEANED_CSV_FILE}' was not found. Please ensure the path is correct.")
exit()
except Exception as e:
print(f"An unexpected error occurred while loading the dataset: {e}")
exit()
Loaded fully cleaned data from: Datasets\Cleaned_Preprocessed\mgnrega_data_fully_cleaned.csv
DataFrame shape after robust numerical handling: (50892, 40)
--- Filtering Data for Consistent Historical Analysis (Excluding 2024-2025 and 2025-2026) ---
DataFrame shape after filtering ['2024-2025', '2025-2026']: (50892, 40)
Financial years remaining in data: ['2018-2019' '2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024']
--- Sorting DataFrame by Financial Year, Month (Financial Order), State Code, and District Code ---
DataFrame sorted successfully.
fin_year month state_code State district_code \
0 2018-2019 April 1 ANDAMAN AND NICOBAR 101
1 2018-2019 April 1 ANDAMAN AND NICOBAR 102
2 2018-2019 April 1 ANDAMAN AND NICOBAR 103
3 2018-2019 April 2 ANDHRA PRADESH 201
4 2018-2019 April 2 ANDHRA PRADESH 202
5 2018-2019 April 2 ANDHRA PRADESH 203
6 2018-2019 April 2 ANDHRA PRADESH 204
7 2018-2019 April 2 ANDHRA PRADESH 205
8 2018-2019 April 2 ANDHRA PRADESH 206
9 2018-2019 April 2 ANDHRA PRADESH 207
10 2018-2019 April 2 ANDHRA PRADESH 208
11 2018-2019 April 2 ANDHRA PRADESH 209
12 2018-2019 April 2 ANDHRA PRADESH 210
13 2018-2019 April 2 ANDHRA PRADESH 211
14 2018-2019 April 2 ANDHRA PRADESH 212
15 2018-2019 April 2 ANDHRA PRADESH 213
16 2018-2019 April 3 ARUNACHAL PRADESH 301
17 2018-2019 April 3 ARUNACHAL PRADESH 302
18 2018-2019 April 3 ARUNACHAL PRADESH 303
19 2018-2019 April 3 ARUNACHAL PRADESH 304
District
0 SOUTH ANDAMAN
1 NICOBARS
2 NORTH AND MIDDLE ANDAMAN
3 SRIKAKULAM
4 VIZIANAGARAM
5 VISAKHAPATANAM
6 EAST GODAVARI
7 WEST GODAVARI
8 KRISHNA
9 GUNTUR
10 PRAKASAM
11 NELLORE
12 CHITTOOR
13 Y.S.R
14 ANANTAPUR
15 KURNOOL
16 TAWANG
17 WEST KAMENG
18 EAST KAMENG
19 LOWER SUBANSIRI
An unexpected error occurred while loading the dataset: name 'FINAL_FILTERED_SORTED_CSV_FILE' is not defined
In [34]:
print("\n--- Starting EDA Step 3: Refined Visualizations and Deep Dive into Trends and Patterns ---")
# --- Feature Engineering for deeper insights (re-run to ensure consistency) ---
df['Women_Persondays_Ratio'] = df['Women_Persondays'] / df['Persondays_of_Central_Liability_so_far']
df['Women_Persondays_Ratio'] = df['Women_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['Women_Persondays_Ratio'] = np.clip(df['Women_Persondays_Ratio'], 0, 1)
df['SC_Persondays_Ratio'] = df['SC_persondays'] / df['Persondays_of_Central_Liability_so_far']
df['SC_Persondays_Ratio'] = df['SC_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['SC_Persondays_Ratio'] = np.clip(df['SC_Persondays_Ratio'], 0, 1)
df['ST_Persondays_Ratio'] = df['ST_persondays'] / df['Persondays_of_Central_Liability_so_far']
df['ST_Persondays_Ratio'] = df['ST_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['ST_Persondays_Ratio'] = np.clip(df['ST_Persondays_Ratio'], 0, 1)
df['100_Days_HH_Ratio'] = df['Total_No_of_HHs_completed_100_Days_of_Wage_Employment'] / df['Total_Households_Worked']
df['100_Days_HH_Ratio'] = df['100_Days_HH_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['100_Days_HH_Ratio'] = np.clip(df['100_Days_HH_Ratio'], 0, 1)
print("\n--- Feature Engineering Completed ---")
--- Starting EDA Step 3: Refined Visualizations and Deep Dive into Trends and Patterns --- --- Feature Engineering Completed ---
In [35]:
# --- Analysis by State (Visualizations with refined plots and explicit hue) ---
print("\n--- State-wise Aggregated Metrics ---")
# Aggregate all state performance metrics
state_performance = df.groupby('State').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean'),
Total_Women_Persondays=('Women_Persondays', 'sum'),
Avg_Women_Persondays_Ratio=('Women_Persondays_Ratio', 'mean'),
Avg_NRM_Exp_Percent=('percent_of_NRM_Expenditure', 'mean')
)
print("\n--- State Performance Aggregated Data (Head) ---")
print(state_performance.head())
print("\n--- State Performance Aggregated Data (Describe) ---")
print(state_performance.describe())
--- State-wise Aggregated Metrics ---
--- State Performance Aggregated Data (Head) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
State
ANDAMAN AND NICOBAR 2.324324e+04 4.157408e+06 460.459545
ANDHRA PRADESH 4.084342e+07 1.289226e+09 207.741754
ARUNACHAL PRADESH 1.436587e+06 3.570200e+07 441.097322
ASSAM 7.246806e+06 7.832448e+08 209.195901
BIHAR 2.381411e+07 1.836347e+09 182.112772
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency \
State
ANDAMAN AND NICOBAR 0.006548 71.759259
ANDHRA PRADESH 0.046128 99.982532
ARUNACHAL PRADESH 0.001937 74.132324
ASSAM 0.004152 99.809024
BIHAR 0.002771 99.994145
Total_Women_Persondays Avg_Women_Persondays_Ratio \
State
ANDAMAN AND NICOBAR 3.637515e+06 0.526728
ANDHRA PRADESH 7.420281e+09 0.587645
ARUNACHAL PRADESH 1.768506e+08 0.387847
ASSAM 1.350378e+09 0.450713
BIHAR 4.431626e+09 0.532949
Avg_NRM_Exp_Percent
State
ANDAMAN AND NICOBAR 44.065602
ANDHRA PRADESH 65.776250
ARUNACHAL PRADESH 39.173804
ASSAM 56.364368
BIHAR 38.235881
--- State Performance Aggregated Data (Describe) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
count 3.400000e+01 3.400000e+01 34.000000
mean 1.102063e+07 6.200833e+08 270.371549
std 1.258963e+07 7.058387e+08 149.850695
min 3.299419e+02 1.174792e+06 155.342062
25% 1.991391e+06 4.070123e+07 186.394592
50% 4.150339e+06 2.019835e+08 212.583944
75% 1.734133e+07 1.141402e+09 276.175995
max 4.084342e+07 2.197404e+09 864.677323
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency Total_Women_Persondays \
count 34.000000 34.000000 3.400000e+01
mean 0.021129 92.198627 2.132984e+09
std 0.019542 18.583462 2.855728e+09
min 0.000000 6.944444 5.353100e+04
25% 0.006618 95.998545 2.608064e+08
50% 0.013206 99.801665 7.725087e+08
75% 0.033456 99.979579 2.980862e+09
max 0.064929 100.000000 1.122258e+10
Avg_Women_Persondays_Ratio Avg_NRM_Exp_Percent
count 34.000000 34.000000
mean 0.506298 55.412519
std 0.170180 13.989864
min 0.077807 8.152778
25% 0.428733 50.184504
50% 0.499531 56.584258
75% 0.586700 61.543056
max 0.891152 85.012500
In [ ]:
# Visualizations: Bar Charts for State Performance - Now showing all states, sorted by value, with value labels
fig, axes = plt.subplots(3, 2, figsize=(20, 25))
fig.suptitle('State-wise Key MGNREGA Performance Indicators', fontsize=22) # Changed title
plt.subplots_adjust(left=0.1, right=0.9, top=0.92, bottom=0.08, hspace=0.4, wspace=0.3)
# Helper function to plot bar charts for state performance with values
def plot_state_bar_chart_with_values(ax, data, x_col, y_col, title, x_label, y_label, formatter=None, palette='viridis', ascending=False):
# Sort data for plotting (highest values at the top)
sorted_data = data.sort_values(by=x_col, ascending=ascending).reset_index()
if not sorted_data.empty and not sorted_data[x_col].isnull().all() and not (sorted_data[x_col] == 0).all():
bars = sns.barplot(x=x_col, y=y_col, data=sorted_data, ax=ax, palette=palette)
ax.set_title(title, fontsize=16)
ax.set_xlabel(x_label, fontsize=13)
ax.set_ylabel(y_label, fontsize=13)
if formatter:
ax.xaxis.set_major_formatter(plt.FuncFormatter(formatter))
ax.tick_params(axis='x', labelsize=11)
ax.tick_params(axis='y', labelsize=11)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Annotate bars with values
for container in ax.containers: # Iterate through containers for bar_label
ax.bar_label(container, fmt=formatter if formatter else '%.0f',
labels=[formatter(v, None) if formatter else f'{v:.0f}' for v in container.datavalues],
fontsize=8, padding=3, color='black') # Default to black text, can refine
# --- Dynamic xlim adjustment for better visibility ---
if sorted_data[x_col].max() > 0:
current_max_x = sorted_data[x_col].max()
if 'Cr' in x_label:
ax.set_xlim(left=0, right=current_max_x * 1.1)
elif 'M' in x_label:
ax.set_xlim(left=0, right=current_max_x * 1.1)
elif 'Ratio' in x_label:
ax.set_xlim(left=0, right=1.0)
elif 'Percentage' in x_label:
ax.set_xlim(left=0, right=100.0)
elif 'Wage' in x_label:
ax.set_xlim(left=0, right=max(current_max_x * 1.1, 850)) # Cap wage at 850 if values are low, otherwise scale to max
else:
ax.set_xlim(left=0, right=current_max_x * 1.1)
else:
ax.set_title(f"No meaningful data for {title.split('by ')[1]}", fontsize=16)
ax.text(0.5, 0.5, 'Data not available or all zeros', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, fontsize=14, color='red')
ax.set_visible(True) # Ensure axis is visible to show "No Data" message
ax.set_xticks([])
ax.set_yticks([])
# Plot 1: States by Total Expenditure (₹)
plot_state_bar_chart_with_values(axes[0, 0], state_performance, # Changed to show all states
'Total_Expenditure', 'State',
'States by Total Expenditure (₹)', 'Total Expenditure', 'State',
lambda x, _: f'{x/1e7:.1f} Cr', 'viridis', ascending=True)
# Comment for Plot 1
"""
Purpose: To visualize the total financial investment in MGNREGA across all states.
Insights: Identifies states with the largest and smallest overall expenditure.
Interpretation: Highlights states with substantial program scale. Large disparities suggest varying levels of program implementation or regional demands for work. The bar labels provide exact figures, aiding precise comparison.
"""
# Plot 2: States by Total Workers Employed
plot_state_bar_chart_with_values(axes[0, 1], state_performance, # Changed to show all states
'Total_Workers', 'State', 'States by Total Workers Employed',
'Total Workers', 'State', lambda x, _: f'{x/1e6:.1f} M', 'magma', ascending=True)
# Comment for Plot 2
"""
Purpose: To visualize the total number of workers engaged in MGNREGA activities across all states.
Insights: Shows the geographical reach of the scheme in terms of human resource engagement.
Interpretation: High worker numbers are critical for livelihood support. These figures are cumulative over time, not unique individuals, indicating the volume of employment generated. Labels offer precise worker counts.
"""
# Plot 3: States by Average Daily Wage (₹)
plot_state_bar_chart_with_values(axes[1, 0], state_performance, # Changed to show all states
'Avg_Daily_Wage', 'State', 'States by Average Daily Wage (₹)',
'Average Daily Wage (₹)', 'State', palette='plasma', ascending=True)
# Comment for Plot 3
"""
Purpose: To compare the average daily wage rates offered by MGNREGA across different states.
Insights: Reveals regional variations in compensation for work under the scheme.
Interpretation: Differences in average wages can stem from variations in state-specific minimum wages or the nature of work (skilled vs. unskilled). Higher wages improve worker earnings, while lower wages might indicate areas needing policy review to ensure fair compensation.
"""
# Plot 4: States by Average 100-Days HH Ratio
plot_state_bar_chart_with_values(axes[1, 1], state_performance, # Changed to show all states
'Avg_100_Days_HH_Ratio', 'State', 'States by Average 100-Days HH Ratio',
'Average Ratio', 'State', palette='coolwarm', ascending=True)
# Comment for Plot 4
"""
Purpose: To assess the proportion of households completing the guaranteed 100 days of employment across states.
Insights: A key indicator of the scheme's effectiveness in providing sustained livelihood support.
Interpretation: Very low ratios across all states suggest a significant gap between the scheme's intent and its execution on the ground. States with higher ratios are more successful in this core objective. This gap could be due to demand-side issues (workers not seeking 100 days) or supply-side issues (lack of available work, administrative hurdles).
"""
# Plot 5: States by Average Payment Efficiency (%)
plot_state_bar_chart_with_values(axes[2, 0], state_performance, # Changed to show all states
'Avg_Payment_Efficiency', 'State', 'States by Average Payment Efficiency (%)',
'Average Percentage (%)', 'State', palette='OrRd', ascending=True)
# Comment for Plot 5
"""
Purpose: To visualize states' administrative efficiency in disbursing wages within the stipulated 15 days.
Insights: Highlights states with high and low adherence to timely payment.
Interpretation: High payment efficiency is crucial for worker welfare, reducing financial stress and increasing trust in the scheme. Low efficiency in some states indicates administrative bottlenecks that require urgent attention.
"""
# Plot 6: States by Average Women Persondays Ratio
plot_state_bar_chart_with_values(axes[2, 1], state_performance, # Changed to show all states
'Avg_Women_Persondays_Ratio', 'State', 'States by Average Women Persondays Ratio',
'Average Ratio', 'State', palette='viridis', ascending=True)
# Comment for Plot 6
"""
Purpose: To assess gender inclusivity and the proportion of employment generated for women across states.
Insights: Reveals states that are more successful in mobilizing and providing work for women.
Interpretation: Higher ratios signify better gender equity in MGNREGA implementation. This is a vital social equity metric, indicating successful efforts in empowering women in the rural workforce. Lower ratios might suggest cultural barriers or lack of suitable work for women in those regions.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
The Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
In [ ]:
# --- Analysis by Time (Financial Year and Month) ---
print("\n--- Trends Over Financial Years ---")
# --- Deriving calendar_year and calendar_month on-the-fly for time series ---
# This ensures we are not relying on a globally created 'Date' column and its potential issues
# Updated month_order to reflect Indian financial year starting April
financial_month_order_for_plot = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
short_to_full_month_map = {
'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
}
month_to_num_for_fy = {month: i for i, month in enumerate(financial_month_order_for_plot)}
# Create a temporary DataFrame for time series analysis
df_temp_time = df.copy() # df is already filtered now
# Ensure 'month' column is clean before mapping
df_temp_time['month_cleaned_for_temp'] = df_temp_time['month'].astype(str).str.strip().str.title()
# Map short names to full names, and then get numerical order
df_temp_time['month_full_name'] = df_temp_time['month_cleaned_for_temp'].map(short_to_full_month_map).fillna(df_temp_time['month_cleaned_for_temp'])
df_temp_time['month_num_fy_order'] = df_temp_time['month_full_name'].map(month_to_num_for_fy)
df_temp_time['start_year_temp'] = df_temp_time['fin_year'].astype(str).str.split('-').str[0].astype(int)
# Corrected get_calendar_year_for_temp logic for Indian financial year (April-March)
def get_calendar_year_for_temp(row):
start_year = row['start_year_temp']
month_num_fy_order = row['month_num_fy_order'] # Using the order number for current year determination
if pd.isna(start_year) or pd.isna(month_num_fy_order):
return np.nan
# If month is April-Dec (0-8 in financial_month_order_for_plot index), it's the start year
# If month is Jan-Mar (9-11 in financial_month_order_for_plot index), it's start_year + 1
if month_num_fy_order >= month_to_num_for_fy['April']: # Months from April to December (index 0 to 8)
return int(start_year)
else: # Months January to March (index 9 to 11)
return int(start_year + 1)
df_temp_time['calendar_year_temp'] = df_temp_time.apply(get_calendar_year_for_temp, axis=1)
# Drop rows with NaNs in necessary temporal columns
df_temp_time.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
df_temp_time['calendar_year_temp'] = df_temp_time['calendar_year_temp'].astype(int)
# Filter for yearly trends (no further filtering needed, df is already filtered)
yearly_trends = df_temp_time.groupby('fin_year').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).sort_values(by='fin_year')
print("Yearly Trends (Operating on filtered data):")
print(yearly_trends)
# Refined Yearly Trends Plot: Using two separate line plots for clarity
fig, ax1 = plt.subplots(figsize=(15, 8))
# Plot Total Expenditure
color = 'tab:blue'
ax1.set_xlabel('Financial Year', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1 = ax1.plot(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points for Total Expenditure
for x, y in zip(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
# Plot Total Workers on a twin axis
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2 = ax2.plot(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
# Annotate points for Total Workers
for x, y in zip(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Yearly Trends: Total Expenditure and Workers Employed (2018-2024)', fontsize=16) # Updated title
# Add a combined legend from both axes
lines_combined = line1 + line2
labels_combined = [l.get_label() for l in lines_combined]
ax2.legend(lines_combined, labels_combined, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
# Comment for Yearly Trends: Total Expenditure and Workers Employed
"""
Purpose: To visualize the overall temporal evolution of financial investment and human resource utilization under MGNREGA for historically consistent years.
Insights: Reveals major spikes or dips in activity for comparable periods. Excludes 2024-2025 and 2025-2026 for accurate trend representation due to data granularity and completeness issues.
Interpretation: A generally increasing trend over years indicates program expansion or increased demand. Consistent trend analysis is now possible. Scaling to crores and millions helps in readability for large numbers. Annotated values provide precise figures for each year.
"""
--- Trends Over Financial Years ---
Yearly Trends (Operating on filtered data):
Total_Expenditure Total_Workers Avg_Daily_Wage \
fin_year
2018-2019 4.332698e+07 3.307431e+09 215.354496
2019-2020 4.202751e+07 3.374215e+09 225.300330
2020-2021 7.354912e+07 3.713326e+09 231.115160
2021-2022 7.468682e+07 3.837848e+09 232.384528
2022-2023 6.748102e+07 3.671221e+09 274.078549
2023-2024 7.362989e+07 3.178792e+09 269.381790
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
fin_year
2018-2019 0.021104 95.018298
2019-2020 0.018640 95.453846
2020-2021 0.027561 97.118983
2021-2022 0.024389 97.668608
2022-2023 0.016337 98.318128
2023-2024 0.019112 97.457468
Out[ ]:
'\nPurpose: To visualize the overall temporal evolution of financial investment and human resource utilization under MGNREGA for historically consistent years.\nInsights: Reveals major spikes or dips in activity for comparable periods. Excludes 2024-2025 and 2025-2026 for accurate trend representation due to data granularity and completeness issues.\nInterpretation: A generally increasing trend over years indicates program expansion or increased demand. Consistent trend analysis is now possible. Scaling to crores and millions helps in readability for large numbers. Annotated values provide precise figures for each year.\n'
In [ ]:
# Separate plots for other yearly trends for clarity
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Yearly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024)', fontsize=16) # Updated title
# Plot Avg_Daily_Wage by Year
sns.lineplot(x=yearly_trends.index, y='Avg_Daily_Wage', data=yearly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Year', fontsize=14)
axes[0].set_xlabel('Financial Year', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# Comment for Plot (Avg_Daily_Wage)
"""
Purpose: To track the trend of average daily wages provided under MGNREGA over time for consistent years.
Insights: Shows whether worker income is increasing or stagnating.
Interpretation: A consistent upward trend is a positive sign for improving livelihood standards under the scheme. Any stagnation or decrease would warrant concern and policy review. Excludes anomalous years. Annotated values provide precise figures.
"""
# Plot Avg_100_Days_HH_Ratio by Year
sns.lineplot(x=yearly_trends.index, y='Avg_100_Days_HH_Ratio', data=yearly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Year', fontsize=14)
axes[1].set_xlabel('Financial Year', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# Comment for Plot (Avg_100_Days_HH_Ratio)
"""
Purpose: To understand the longitudinal effectiveness of providing the full 100 days of guaranteed work for consistent years.
Insights: Reveals the scheme's success in delivering its core promise. Low or declining trends highlight challenges.
Interpretation: Observed trends in this ratio are critical for policy attention. Possible reasons for fluctuations include demand for work, administrative bottlenecks, or policy shifts. Annotated values provide precise figures.
"""
# Plot Avg_Payment_Efficiency by Year
sns.lineplot(x=yearly_trends.index, y='Avg_Payment_Efficiency', data=yearly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Year', fontsize=14)
axes[2].set_xlabel('Financial Year', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# Comment for Plot (Avg_Payment_Efficiency)
"""
Purpose: To monitor the efficiency of wage disbursement over financial years for consistent years.
Insights: Timely payments are crucial for worker welfare. A high and stable percentage is ideal.
Interpretation: Observed efficiency trends are critical for understanding administrative performance. Any dips warrant investigation, as payment delays can severely impact the financial stability of workers. Annotated values provide precise figures.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
In [ ]:
# Monthly trends (considering all years for seasonality)
# df is already filtered, so df_temp_time will also be filtered
df_filtered_for_monthly = df_temp_time.copy()
df_filtered_for_monthly.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
# Debugging the content of df_filtered_for_monthly just before monthly aggregation
print("\n--- Debugging df_filtered_for_monthly before Monthly Aggregation ---")
print(f"Shape: {df_filtered_for_monthly.shape}")
print(f"month_num_fy_order nulls: {df_filtered_for_monthly['month_num_fy_order'].isnull().sum()}")
print(f"calendar_year_temp nulls: {df_filtered_for_monthly['calendar_year_temp'].isnull().sum()}")
print(f"Financial years present: {df_filtered_for_monthly['fin_year'].unique()}")
print(f"Sample of data for monthly trends:\n{df_filtered_for_monthly[['fin_year', 'month', 'Total_Exp', 'Total_No_of_Workers', 'Average_Wage_rate_per_day_per_person']].head()}")
print(f"Sum of Total_Exp in df_filtered_for_monthly: {df_filtered_for_monthly['Total_Exp'].sum()}")
print(f"Sum of Total_No_of_Workers in df_filtered_for_monthly: {df_filtered_for_monthly['Total_No_of_Workers'].sum()}")
df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['month'].astype(str).str.strip().str.title().astype('category')
# Updated month_order to reflect Indian financial year starting April
month_order_financial_year = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
# Re-map short month names to full for consistent ordering and mapping
short_to_full_month = {
'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
}
df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['calendar_month'].map(short_to_full_month).fillna(df_filtered_for_monthly['calendar_month']) # Map short to full, keep full as is
df_filtered_for_monthly['calendar_month'] = pd.Categorical(df_filtered_for_monthly['calendar_month'], categories=month_order_financial_year, ordered=True) # Use new financial year month order
monthly_trends = df_filtered_for_monthly.groupby('calendar_month').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reindex(month_order_financial_year) # Order months for consistent plotting
monthly_trends = monthly_trends.fillna(0)
--- Debugging df_filtered_for_monthly before Monthly Aggregation ---
Shape: (50892, 45)
month_num_fy_order nulls: 0
calendar_year_temp nulls: 0
Financial years present: ['2018-2019' '2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024']
Sample of data for monthly trends:
fin_year month Total_Exp Total_No_of_Workers \
0 2018-2019 April 4.369320 18558.0
1 2018-2019 April 0.000000 9786.0
2 2018-2019 April 0.150230 32903.0
3 2018-2019 April 6662.301165 1320134.0
4 2018-2019 April 6176.642097 1222586.0
Average_Wage_rate_per_day_per_person
0 37.013536
1 252.492009
2 252.492009
3 140.062250
4 123.668847
Sum of Total_Exp in df_filtered_for_monthly: 374701344.0047136
Sum of Total_No_of_Workers in df_filtered_for_monthly: 21082833425.0
In [ ]:
print("\n--- Monthly Trends (Aggregated Across All Years, for 2018-2024 FYs) ---") # Updated title
print(monthly_trends)
print("\n--- Monthly Trends Describe ---")
print(monthly_trends.describe())
fig, ax1 = plt.subplots(figsize=(15, 8))
color = 'tab:blue'
ax1.set_xlabel('Month', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1_monthly = ax1.plot(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points for Total Expenditure (Monthly)
for x, y in zip(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2_monthly = ax2.plot(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
# Annotate points for Total Workers (Monthly)
for x, y in zip(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Monthly Trends: Total Expenditure and Workers Employed (2018-2024 FYs)', fontsize=16) # Updated title
lines_combined_monthly = line1_monthly + line2_monthly
labels_combined_monthly = [l.get_label() for l in lines_combined_monthly]
ax2.legend(lines_combined_monthly, labels_combined_monthly, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
# Comment for Monthly Trends: Total Expenditure and Workers Employed
"""
Purpose: To identify seasonal patterns in MGNREGA expenditure and worker engagement, aligned with the Indian financial year, using historically consistent data (2018-2024 FYs).
Insights: Reveals peak and lean periods of activity within the financial year.
Interpretation: Activity often aligns with agricultural cycles (e.g., lower demand during peak agricultural seasons, higher during lean periods). Understanding these cycles is crucial for efficient resource planning and matching work availability with demand. This plot aggregates data across all complete financial years. Annotated values provide precise figures.
"""
--- Monthly Trends (Aggregated Across All Years, for 2018-2024 FYs) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
calendar_month
April 3.501258e+06 1.759502e+09 306.991754
May 9.071760e+06 1.763155e+09 252.167022
June 1.613888e+07 1.765832e+09 269.522965
July 2.221178e+07 1.767538e+09 258.861407
August 2.722379e+07 1.767046e+09 250.516487
September 3.121897e+07 1.767238e+09 239.753141
October 3.486475e+07 1.766182e+09 232.320477
November 3.831889e+07 1.764144e+09 225.823328
December 4.218389e+07 1.755753e+09 221.710547
January 4.567729e+07 1.744826e+09 217.258905
February 4.909899e+07 1.736200e+09 210.659327
March 5.519109e+07 1.725416e+09 218.892321
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
calendar_month
April 0.000166 91.398682
May 0.001401 93.784956
June 0.004216 95.247326
July 0.006523 96.216331
August 0.008978 97.091014
September 0.012111 97.609672
October 0.015836 98.031049
November 0.020845 98.314447
December 0.028194 98.429422
January 0.036863 98.617868
February 0.049889 98.781608
March 0.069138 99.086138
--- Monthly Trends Describe ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
count 1.200000e+01 1.200000e+01 12.000000
mean 3.122511e+07 1.756903e+09 242.039807
std 1.615437e+07 1.399384e+07 27.538471
min 3.501258e+06 1.725416e+09 210.659327
25% 2.069355e+07 1.753021e+09 221.005990
50% 3.304186e+07 1.763649e+09 236.036809
75% 4.305724e+07 1.766398e+09 253.840618
max 5.519109e+07 1.767538e+09 306.991754
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
count 12.000000 12.000000
mean 0.021180 96.884043
std 0.021351 2.347021
min 0.000166 91.398682
25% 0.005946 95.974080
50% 0.013973 97.820361
75% 0.030361 98.476534
max 0.069138 99.086138
Out[ ]:
'\nPurpose: To identify seasonal patterns in MGNREGA expenditure and worker engagement, aligned with the Indian financial year, using historically consistent data (2018-2024 FYs).\nInsights: Reveals peak and lean periods of activity within the financial year.\nInterpretation: Activity often aligns with agricultural cycles (e.g., lower demand during peak agricultural seasons, higher during lean periods). Understanding these cycles is crucial for efficient resource planning and matching work availability with demand. This plot aggregates data across all complete financial years. Annotated values provide precise figures.\n'
In [ ]:
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Monthly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024 FYs)', fontsize=16) # Updated title
# Plot Avg_Daily_Wage by Month
sns.lineplot(x=monthly_trends.index, y='Avg_Daily_Wage', data=monthly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Month', fontsize=14)
axes[0].set_xlabel('Month', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# Comment for Plot (Avg_Daily_Wage by Month)
"""
Purpose: To analyze monthly variations in average daily wage rates, aligned with the Indian financial year, for consistent historical data.
Insights: Can show if wages fluctuate seasonally or are relatively stable throughout the year.
Interpretation: Significant monthly fluctuations might indicate changes in work types or local economic conditions. Relative stability, especially near the median, suggests consistent wage policy application. Annotated values provide precise figures.
"""
# Plot Avg_100_Days_HH_Ratio by Month
sns.lineplot(x=monthly_trends.index, y='Avg_100_Days_HH_Ratio', data=monthly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# Comment for Plot (Avg_100_Days_HH_Ratio by Month)
"""
Purpose: To identify monthly patterns in the provision of 100 days of employment, aligned with the Indian financial year, for consistent historical data.
Insights: Reveals if certain months are better or worse for achieving this key scheme objective.
Interpretation: Seasonal peaks might align with lean agricultural seasons when demand for work is highest. Understanding these patterns can help target interventions. Annotated values provide precise figures.
"""
# Plot Avg_Payment_Efficiency by Month
sns.lineplot(x=monthly_trends.index, y='Avg_Payment_Efficiency', data=monthly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Month', fontsize=14)
axes[2].set_xlabel('Month', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# Comment for Plot (Avg_Payment_Efficiency by Month)
"""
Purpose: To assess monthly consistency in payment efficiency, aligned with the Indian financial year, for consistent historical data.
Insights: Can highlight months where administrative bottlenecks or external factors might cause payment delays.
Interpretation: Stable high efficiency indicates a robust system. Dips can signal areas where the payment mechanism needs strengthening, especially if it coincides with peak work demand. Annotated values provide precise figures.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
In [ ]:
# --- New Visualization: Monthly Trends for Key Metrics by Financial Year ---
print("\n--- Monthly Trends by Financial Year for Key Metrics (2018-2024 FYs) ---") # Updated title
# Data preparation for the new plot: Group by fin_year and calendar_month
df_temp_time['calendar_month_for_plot'] = df_temp_time['month'].astype(str).str.strip().str.title().astype('category')
df_temp_time['calendar_month_for_plot'] = df_temp_time['calendar_month_for_plot'].map(short_to_full_month).fillna(df_temp_time['calendar_month_for_plot'])
df_temp_time['calendar_month_for_plot'] = pd.Categorical(df_temp_time['calendar_month_for_plot'], categories=financial_month_order_for_plot, ordered=True) # Use financial year month order
# Filter out the incomplete 2025-2026 data (already handled by df filtering)
monthly_yearly_trends_filtered = df_temp_time.groupby(['fin_year', 'calendar_month_for_plot']).agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reset_index().rename(columns={'calendar_month_for_plot': 'Month'})
# Fill NaNs only for numerical columns in this aggregated DataFrame
numerical_cols_for_fill = ['Total_Expenditure', 'Total_Workers', 'Avg_Daily_Wage', 'Avg_100_Days_HH_Ratio', 'Avg_Payment_Efficiency']
for col in numerical_cols_for_fill:
if col in monthly_yearly_trends_filtered.columns:
monthly_yearly_trends_filtered[col] = monthly_yearly_trends_filtered[col].fillna(0)
# Plotting for Total Expenditure by Month for Each Year
plt.figure(figsize=(18, 10))
lineplot_exp = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Expenditure', hue='fin_year', marker='o', palette='tab10')
plt.title('Monthly Total Expenditure (Crores ₹) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Expenditure (Crores ₹)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# Annotate points for Monthly Total Expenditure
for line in lineplot_exp.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val): # Ensure y_val is not NaN
plt.text(x_val, y_val, f'{y_val/1e7:.1f} Cr', fontsize=7, ha='center', va='bottom', color=line.get_color())
plt.tight_layout()
plt.show()
# Comment for Monthly Total Expenditure by Financial Year
"""
Purpose: To visualize the month-wise trends of total expenditure for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
Insights: Helps in identifying year-specific anomalies or deviations from typical seasonal patterns. Reveals if program scale increased in certain months in specific years.
Interpretation: For example, a sudden spike in expenditure during a particular month in one year might correspond to specific project rollouts or disaster relief efforts. This granular view helps in understanding yearly operational nuances.
"""
--- Monthly Trends by Financial Year for Key Metrics (2018-2024 FYs) ---
Out[ ]:
'\nPurpose: To visualize the month-wise trends of total expenditure for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.\nInsights: Helps in identifying year-specific anomalies or deviations from typical seasonal patterns. Reveals if program scale increased in certain months in specific years.\nInterpretation: For example, a sudden spike in expenditure during a particular month in one year might correspond to specific project rollouts or disaster relief efforts. This granular view helps in understanding yearly operational nuances.\n'
In [ ]:
# Plotting for Total Workers by Month for Each Year
plt.figure(figsize=(18, 10))
lineplot_workers = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Workers', hue='fin_year', marker='x', palette='tab10')
plt.title('Monthly Total Workers (Millions) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Workers (Millions)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# Annotate points for Monthly Total Workers
for line in lineplot_workers.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val): # Ensure y_val is not NaN
plt.text(x_val, y_val, f'{y_val/1e6:.1f} M', fontsize=7, ha='center', va='bottom', color=line.get_color())
plt.tight_layout()
plt.show()
# Comment for Monthly Total Workers by Financial Year
"""
Purpose: To visualize the month-wise trends of total workers employed for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
Insights: Helps in identifying seasonal employment patterns and how they might vary year-on-year.
Interpretation: Shows how demand for work or availability of projects fluctuates monthly within each financial year. This is vital for understanding employment stability and planning resource allocation.
"""
Out[ ]:
'\nPurpose: To visualize the month-wise trends of total workers employed for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.\nInsights: Helps in identifying seasonal employment patterns and how they might vary year-on-year.\nInterpretation: Shows how demand for work or availability of projects fluctuates monthly within each financial year. This is vital for understanding employment stability and planning resource allocation.\n'
In [ ]:
# --- Geospatial Analysis (Choropleth Maps) ---
print("\n--- Starting Geospatial Analysis ---")
# Load India's state boundaries
try:
# IMPORTANT: VERIFY YOUR GEOJSON FILE PATH AND FILENAME!
if not os.path.exists(GEOJSON_FILE):
raise FileNotFoundError(f"GeoJSON file not found at: {GEOJSON_FILE}")
india_states_gdf = geopandas.read_file(GEOJSON_FILE)
print(f"Loaded GeoJSON from: {GEOJSON_FILE}")
print(f"GeoDataFrame columns: {india_states_gdf.columns.tolist()}")
# Check if the loaded GeoJSON is state-level or district-level
is_state_level_geojson = 'NAME_1' in india_states_gdf.columns and 'NAME_2' not in india_states_gdf.columns
if not is_state_level_geojson:
print("Warning: Loaded GeoJSON appears to be district-level or different structure. Attempting to dissolve to state level.")
if 'NAME_1' in india_states_gdf.columns:
# Dissolve by NAME_1 to get state boundaries from a district-level file
india_states_gdf['State_Clean_for_Dissolve'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
india_states_gdf = india_states_gdf.dissolve(by='State_Clean_for_Dissolve')
# After dissolve, the 'NAME_1' column might become the index or a regular column depending on geopandas version
if 'State_Clean_for_Dissolve' in india_states_gdf.index.names:
india_states_gdf = india_states_gdf.reset_index()
print("Successfully dissolved district-level GeoJSON to state boundaries.")
else:
print("Error: Cannot dissolve GeoJSON to state level. 'NAME_1' column not found. Map plotting might fail or be incorrect.")
india_states_gdf['State_Clean'] = ''
# IMPORTANT: Consistent state name cleaning for both GeoDataFrame and aggregated data
# Standardize names: remove spaces, replace '&' with 'AND', remove dots.
if 'NAME_1' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
elif 'State_Clean_for_Dissolve' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['State_Clean_for_Dissolve'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
elif 'NAME_LOCAL' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['NAME_LOCAL'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
else:
print("Warning: Could not find standard state name column in GeoJSON for final cleaning. Merge might fail.")
india_states_gdf['State_Clean'] = india_states_gdf.index.astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
state_performance_reset = state_performance.reset_index()
state_performance_reset['State_Clean'] = state_performance_reset['State'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# Debugging state name mismatches
mgnrega_states = set(state_performance_reset['State_Clean'].unique())
geojson_states = set(india_states_gdf['State_Clean'].unique())
print(f"\nStates in MGNREGA data: {sorted(list(mgnrega_states))}")
print(f"States in GeoJSON data: {sorted(list(geojson_states))}")
print(f"States in MGNREGA but not in GeoJSON: {mgnrega_states - geojson_states}")
print(f"States in GeoJSON but not in MGNREGA: {geojson_states - mgnrega_states}")
common_states = list(mgnrega_states.intersection(geojson_states))
if not common_states:
print("Warning: No common states found between MGNREGA data and GeoJSON. Map plotting might be empty or incorrect.")
print("This is often due to naming inconsistencies. Please review the 'States in MGNREGA data' and 'States in GeoJSON data' lists above.")
merged_gdf = india_states_gdf.merge(state_performance_reset, left_on='State_Clean', right_on='State_Clean', how='left')
for col in state_performance.columns:
if col in merged_gdf.columns:
if merged_gdf[col].isnull().any():
merged_gdf[col] = merged_gdf[col].fillna(0)
# Plot Choropleth Maps for key metrics
fig, axes = plt.subplots(3, 2, figsize=(20, 25))
fig.suptitle('Geographical Distribution of Key MGNREGA Performance Indicators by State', fontsize=20)
map_metrics = [
('Total_Expenditure', 'Total Expenditure (Crores ₹)', plt.cm.viridis),
('Total_Workers', 'Total Workers (Millions)', plt.cm.magma),
('Avg_Daily_Wage', 'Average Daily Wage (₹)', plt.cm.plasma),
('Avg_100_Days_HH_Ratio', 'Average 100-Days HH Ratio', plt.cm.coolwarm),
('Avg_Payment_Efficiency', 'Average Payment Efficiency (%)', plt.cm.OrRd),
('Avg_Women_Persondays_Ratio', 'Average Women Persondays Ratio', plt.cm.viridis)
]
axes = axes.flatten()
for i, (metric, title, cmap) in enumerate(map_metrics):
ax = axes[i]
plot_data = merged_gdf[metric]
if 'Expenditure' in title:
plot_data = plot_data / 1e7
elif 'Workers' in title:
plot_data = plot_data / 1e6
vmin, vmax = None, None
if 'Ratio' in title or 'Percentage' in title:
vmin = 0
vmax = 1.0 if 'Ratio' in title else 100.0
# Ensure plot_data has non-NaN values for mapping, fill with 0 if needed for states with no data
plot_data_for_map = plot_data.fillna(0) # Temporarily fill for plotting color, missing_kwds still takes over for legend
merged_gdf.plot(column=plot_data_for_map, cmap=cmap, linewidth=0.8, ax=ax, edgecolor='0.8', legend=True,
legend_kwds={'label': title, 'orientation': "horizontal", 'shrink': 0.7},
missing_kwds={"color": "lightgrey", "label": "No Data"},
vmin=vmin, vmax=vmax
)
ax.set_title(title, fontsize=16)
ax.set_axis_off()
# Add state names as text labels (selectively for larger states for clarity)
# Use merged_gdf['State'] for original state names if available
for idx, row in merged_gdf.iterrows():
# Only label states that have data and are large enough for the label to be clear
# Filter for larger states or states with more central geometry to avoid clutter
if pd.notna(row[metric]) and row[metric] > 0 and row['State_Clean'] in common_states:
# Heuristic for selective labeling on India map to prevent clutter
if row['State_Clean'] in ['UTTARPRADESH', 'MADHYAPRADESH', 'RAJASTHAN', 'MAHARASHTRA',
'ANDHRAPRADESH', 'KARNATAKA', 'GUJARAT', 'ODISHA',
'CHHATTISGARH', 'WESTBENGAL', 'BIHAR', 'ASSAM', 'KERALA',
'PUNJAB', 'HARYANA', 'JHARKHAND', 'UTTARAKHAND']:
try:
centroid = row.geometry.centroid
# Adjust text properties based on expected background color from colormap or general contrast
# For viridis/magma/plasma, black text is generally good. For OrRd, black/dark color needed.
ax.text(centroid.x, centroid.y, row['State'], fontsize=7, ha='center', va='center', color='black', alpha=0.9,
bbox=dict(boxstyle="round,pad=0.1", fc='white', ec='none', alpha=0.5)) # Add bbox for readability
except Exception as e:
print(f"Could not add label for {row['State']}: {e}")
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
plt.show()
print("\n--- Geospatial Analysis Completed ---")
except FileNotFoundError:
print(f"Error: GeoJSON file '{GEOJSON_FILE}' not found. Skipping geospatial analysis.")
print("Please ensure you have the GeoJSON file for Indian administrative boundaries in the specified path.")
print("Example: ML_CA1/Datasets/GeoJSON/gadm41_IND_1.json (for states) or gadm41_IND_2.json (if you want to dissolve districts to states).")
except Exception as e:
print(f"An unexpected error occurred during geospatial analysis: {e}")
print("Skipping geospatial analysis. Error details:")
import traceback
traceback.print_exc()
--- Starting Geospatial Analysis ---
Loaded GeoJSON from: Datasets\GEOJSON\gadm41_IND_1.json
GeoDataFrame columns: ['GID_1', 'GID_0', 'COUNTRY', 'NAME_1', 'VARNAME_1', 'NL_NAME_1', 'TYPE_1', 'ENGTYPE_1', 'CC_1', 'HASC_1', 'ISO_1', 'geometry']
States in MGNREGA data: ['ANDAMANANDNICOBAR', 'ANDHRAPRADESH', 'ARUNACHALPRADESH', 'ASSAM', 'BIHAR', 'CHHATTISGARH', 'DNHAVELIANDDD', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHALPRADESH', 'JAMMUANDKASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LADAKH', 'LAKSHADWEEP', 'MADHYAPRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMILNADU', 'TELANGANA', 'TRIPURA', 'UTTARAKHAND', 'UTTARPRADESH', 'WESTBENGAL']
States in GeoJSON data: ['ANDAMANANDNICOBAR', 'ANDHRAPRADESH', 'ARUNACHALPRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'DADRAANDNAGARHAVELI', 'DAMANANDDIU', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHALPRADESH', 'JAMMUANDKASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LAKSHADWEEP', 'MADHYAPRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'NCTOFDELHI', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMILNADU', 'TELANGANA', 'TRIPURA', 'UTTARAKHAND', 'UTTARPRADESH', 'WESTBENGAL']
States in MGNREGA but not in GeoJSON: {'LADAKH', 'DNHAVELIANDDD'}
States in GeoJSON but not in MGNREGA: {'DADRAANDNAGARHAVELI', 'DAMANANDDIU', 'NCTOFDELHI', 'CHANDIGARH'}
--- Geospatial Analysis Completed ---
In [ ]:
# --- Bivariate Analysis: Correlation Matrix ---
print("\n--- Correlation Matrix of Key Numerical Metrics ---")
correlation_cols = [
'Approved_Labour_Budget', 'Total_Exp', 'Wages', 'Persondays_of_Central_Liability_so_far',
'Total_No_of_Workers', 'Total_Households_Worked', 'Total_Individuals_Worked',
'Average_Wage_rate_per_day_per_person', 'percentage_payments_gererated_within_15_days',
'Women_Persondays_Ratio', 'SC_Persondays_Ratio', 'ST_Persondays_Ratio',
'100_Days_HH_Ratio', 'percent_of_NRM_Expenditure', 'percent_of_Expenditure_on_Agriculture_Allied_Works'
]
correlation_matrix = df[correlation_cols].corr()
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Key MGNREGA Metrics', fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
# Comment for Correlation Matrix
"""
Purpose: To quantitatively assess the linear relationships between various key numerical metrics.
Insights: High correlation (close to +1 or -1) indicates that two variables tend to move together or in opposite directions.
Interpretation: For instance, a strong positive correlation between 'Total_Exp' and 'Total_Workers' would indicate that higher spending leads to more employment. This helps identify interdependencies and potential drivers for different aspects of the scheme. Weak correlations might suggest that metrics are independent, or that non-linear relationships exist.
"""
print("\n--- EDA Step 3 Completed. Dataset is ready for in-depth analysis and insights. ---")
--- Correlation Matrix of Key Numerical Metrics ---
--- EDA Step 3 Completed. Dataset is ready for in-depth analysis and insights. ---
In [1]:
# import pandas as pd
# import numpy as np
# import os
# import matplotlib.pyplot as plt
# import seaborn as sns
# import geopandas # Import geopandas for geospatial analysis
# from matplotlib.offsetbox import AnnotationBbox, OffsetImage # For image annotations on maps
# # Suppress warnings for cleaner output
# import warnings
# warnings.filterwarnings('ignore')
# # --- Configuration for file paths ---
# PROJECT_ROOT = ""
# CLEANED_DATA_DIR = os.path.join(PROJECT_ROOT, "Datasets", "Cleaned_Preprocessed")
# FULLY_CLEANED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_fully_cleaned.csv")
# # IMPORTANT: Ensure this GeoJSON file exists in your project structure:
# # ML_CA1/Datasets/GeoJSON/gadm41_IND_1.json (for states) is preferred.
# # If you use gadm41_IND_2.json (district level), we will dissolve it to state level.
# GEOJSON_FILE = os.path.join(PROJECT_ROOT, "Datasets", "GEOJSON", "gadm41_IND_1.json") # Prefer state-level GeoJSON
# # Define the path for the final cleaned, sorted, and FILTERED CSV
# FINAL_FILTERED_SORTED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_filtered_for_analysis.csv")
# # Load the fully cleaned dataset with robust data handling
# try:
# df = pd.read_csv(FULLY_CLEANED_CSV_FILE)
# # --- Robust Numerical Column Re-conversion AFTER CSV Load ---
# # Ensure all numerical columns are correctly typed and handle inf/NaN.
# numerical_cols_to_reconvert = [
# 'Approved_Labour_Budget', 'Average_Wage_rate_per_day_per_person',
# 'Average_days_of_employment_provided_per_Household', 'Differently_abled_persons_worked',
# 'Material_and_skilled_Wages', 'Number_of_Completed_Works',
# 'Number_of_GPs_with_NIL_exp', 'Number_of_Ongoing_Works',
# 'Persondays_of_Central_Liability_so_far', 'SC_persondays',
# 'SC_workers_against_active_workers', 'ST_persondays',
# 'ST_workers_against_active_workers', 'Total_Adm_Expenditure', 'Total_Exp',
# 'Total_Households_Worked', 'Total_Individuals_Worked',
# 'Total_No_of_Active_Job_Cards', 'Total_No_of_Active_Workers',
# 'Total_No_of_HHs_completed_100_Days_of_Wage_Employment',
# 'Total_No_of_JobCards_issued', 'Total_No_of_Workers',
# 'Total_No_of_Works_Takenup', 'Wages', 'Women_Persondays',
# 'percent_of_Category_B_Works', 'percent_of_Expenditure_on_Agriculture_Allied_Works',
# 'percent_of_NRM_Expenditure', 'percentage_payments_gererated_within_15_days'
# ]
# for col in numerical_cols_to_reconvert:
# if col in df.columns:
# df[col] = pd.to_numeric(df[col], errors='coerce')
# df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) # Handle inf/NaN with 0
# df['state_code'] = df['state_code'].fillna(-1).astype(int)
# df['district_code'] = df['district_code'].fillna(-1).astype(int)
# # --- NO GLOBAL DATE COLUMN CREATION (as discussed, will derive as needed) ---
# df['State'] = df['State'].astype('category')
# df['District'] = df['District'].astype('category')
# print(f"Loaded fully cleaned data from: {FULLY_CLEANED_CSV_FILE}")
# print(f"DataFrame shape after robust numerical handling: {df.shape}")
# # --- Filter out 2024-2025 and 2025-2026 for consistent historical analysis (NEW) ---
# print("\n--- Filtering Data for Consistent Historical Analysis (Excluding 2024-2025 and 2025-2026) ---")
# years_to_exclude = ['2024-2025', '2025-2026']
# df = df[~df['fin_year'].isin(years_to_exclude)].copy() # Filter the main DataFrame
# print(f"DataFrame shape after filtering {years_to_exclude}: {df.shape}")
# print(f"Financial years remaining in data: {df['fin_year'].unique()}")
# # --- Data Sorting ---
# print("\n--- Sorting DataFrame by Financial Year, Month (Financial Order), State Code, and District Code ---")
# # 1. Define month order for Indian Financial Year (April to March)
# financial_month_order = ['April', 'May', 'June', 'July', 'August', 'September',
# 'October', 'November', 'December', 'January', 'February', 'March']
# # Create a more robust month_to_num mapping to handle both full and short month names
# # This dictionary is mainly for converting to numerical keys for comparison/sorting if needed,
# # but the primary sorting will leverage CategoricalDtype
# month_to_num_for_plotting = { # Use a different name to avoid confusion with sorting key
# 'April': 0, 'May': 1, 'June': 2, 'July': 3, 'August': 4, 'September': 5,
# 'October': 6, 'November': 7, 'December': 8, 'January': 9, 'February': 10, 'March': 11,
# 'Apr': 0, 'May': 1, 'Jun': 2, 'Jul': 3, 'Aug': 4, 'Sep': 5,
# 'Oct': 6, 'Nov': 7, 'Dec': 8, 'Jan': 9, 'Feb': 10, 'Mar': 11
# }
# # 2. Create numerical sorting key for fin_year
# df['fin_year_sort_key'] = df['fin_year'].astype(str).str.split('-').str[0].astype(int)
# # 3. Process 'month' column and convert to an ORDERED CategoricalDtype
# df['month_processed'] = df['month'].astype(str).str.strip().str.title()
# df['month_full_name_for_sort'] = df['month_processed'].apply(
# lambda x: next((m_full for m_full in financial_month_order if m_full.startswith(x)), x)
# )
# df['month_ordered'] = pd.Categorical(df['month_full_name_for_sort'], categories=financial_month_order, ordered=True)
# # 4. Sort the DataFrame using the custom ordered 'month_ordered'
# df.sort_values(by=['fin_year_sort_key', 'month_ordered', 'state_code', 'district_code'], inplace=True)
# # 5. Drop temporary sorting keys
# df.drop(columns=['fin_year_sort_key', 'month_processed', 'month_full_name_for_sort', 'month_ordered'], inplace=True, errors='ignore')
# print("DataFrame sorted successfully.")
# print(df[['fin_year', 'month', 'state_code', 'State', 'district_code', 'District']].head(20))
# # --- Save the fully cleaned, sorted, and FILTERED DataFrame (UPDATED PATH) ---
# df.to_csv(FINAL_FILTERED_SORTED_CSV_FILE, index=False)
# print(f"\nFiltered data for analysis saved to: {FINAL_FILTERED_SORTED_CSV_FILE}")
# except FileNotFoundError:
# print(f"Error: The file '{FULLY_CLEANED_CSV_FILE}' was not found. Please ensure the path is correct.")
# exit()
# except Exception as e:
# print(f"An unexpected error occurred while loading the dataset: {e}")
# exit()
# print("\n--- Starting EDA Step 3: Refined Visualizations and Deep Dive into Trends and Patterns ---")
# # --- Feature Engineering for deeper insights (re-run to ensure consistency) ---
# df['Women_Persondays_Ratio'] = df['Women_Persondays'] / df['Persondays_of_Central_Liability_so_far']
# df['Women_Persondays_Ratio'] = df['Women_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
# df['Women_Persondays_Ratio'] = np.clip(df['Women_Persondays_Ratio'], 0, 1)
# df['SC_Persondays_Ratio'] = df['SC_persondays'] / df['Persondays_of_Central_Liability_so_far']
# df['SC_Persondays_Ratio'] = df['SC_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
# df['SC_Persondays_Ratio'] = np.clip(df['SC_Persondays_Ratio'], 0, 1)
# df['ST_Persondays_Ratio'] = df['ST_persondays'] / df['Persondays_of_Central_Liability_so_far']
# df['ST_Persondays_Ratio'] = df['ST_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
# df['ST_Persondays_Ratio'] = np.clip(df['ST_Persondays_Ratio'], 0, 1)
# df['100_Days_HH_Ratio'] = df['Total_No_of_HHs_completed_100_Days_of_Wage_Employment'] / df['Total_Households_Worked']
# df['100_Days_HH_Ratio'] = df['100_Days_HH_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
# df['100_Days_HH_Ratio'] = np.clip(df['100_Days_HH_Ratio'], 0, 1)
# print("\n--- Feature Engineering Completed ---")
# # --- Analysis by State (Visualizations with refined plots and explicit hue) ---
# print("\n--- State-wise Aggregated Metrics ---")
# # Aggregate all state performance metrics
# state_performance = df.groupby('State').agg(
# Total_Expenditure=('Total_Exp', 'sum'),
# Total_Workers=('Total_No_of_Workers', 'sum'),
# Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
# Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
# Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean'),
# Total_Women_Persondays=('Women_Persondays', 'sum'),
# Avg_Women_Persondays_Ratio=('Women_Persondays_Ratio', 'mean'),
# Avg_NRM_Exp_Percent=('percent_of_NRM_Expenditure', 'mean')
# )
# print("\n--- State Performance Aggregated Data (Head) ---")
# print(state_performance.head())
# print("\n--- State Performance Aggregated Data (Describe) ---")
# print(state_performance.describe())
# # Visualizations: Bar Charts for State Performance - Now showing all states, sorted by value, with value labels
# fig, axes = plt.subplots(3, 2, figsize=(20, 25))
# fig.suptitle('State-wise Key MGNREGA Performance Indicators', fontsize=22) # Changed title
# plt.subplots_adjust(left=0.1, right=0.9, top=0.92, bottom=0.08, hspace=0.4, wspace=0.3)
# # Helper function to plot bar charts for state performance with values
# def plot_state_bar_chart_with_values(ax, data, x_col, y_col, title, x_label, y_label, formatter=None, palette='viridis', ascending=False):
# # Sort data for plotting (highest values at the top)
# sorted_data = data.sort_values(by=x_col, ascending=ascending).reset_index()
# if not sorted_data.empty and not sorted_data[x_col].isnull().all() and not (sorted_data[x_col] == 0).all():
# bars = sns.barplot(x=x_col, y=y_col, data=sorted_data, ax=ax, palette=palette)
# ax.set_title(title, fontsize=16)
# ax.set_xlabel(x_label, fontsize=13)
# ax.set_ylabel(y_label, fontsize=13)
# if formatter:
# ax.xaxis.set_major_formatter(plt.FuncFormatter(formatter))
# ax.tick_params(axis='x', labelsize=11)
# ax.tick_params(axis='y', labelsize=11)
# ax.grid(axis='x', linestyle='--', alpha=0.7)
# # Annotate bars with values
# for container in ax.containers: # Iterate through containers for bar_label
# for bar in container.patches: # Iterate through individual bars
# x_val = bar.get_width()
# y_val = bar.get_y() + bar.get_height() / 2
# # Determine text color based on background luminance (simple heuristic)
# text_color = 'black' # Default text color
# if formatter: # Use formatted value to check
# formatted_val = formatter(x_val, None)
# if 'Cr' in formatted_val or 'M' in formatted_val:
# # For large monetary/worker values, assume darker colors or high variance, use white
# if x_val > ax.get_xlim()[1] * 0.5: # If bar is past half way, use white
# text_color = 'white'
# elif 'Ratio' in formatted_val or '%' in formatted_val:
# # For percentages/ratios, if value is high, bar is dark, use white
# if x_val > (ax.get_xlim()[1] * 0.6 if ax.get_xlim()[1] > 1 else ax.get_xlim()[1] * 0.6): # Adjusted for ratio scale
# text_color = 'white'
# ax.text(x_val, y_val, formatted_val if formatter else f'{x_val:.0f}',
# color=text_color, ha='right', va='center', fontsize=8, weight='bold',
# bbox=dict(boxstyle="round,pad=0.1", fc='white', ec='none', alpha=0.6)) # Add bbox for readability, slightly transparent
# # --- Dynamic xlim adjustment for better visibility ---
# if sorted_data[x_col].max() > 0:
# current_max_x = sorted_data[x_col].max()
# if 'Cr' in x_label:
# ax.set_xlim(left=0, right=current_max_x * 1.2) # Increased right margin for labels
# elif 'M' in x_label:
# ax.set_xlim(left=0, right=current_max_x * 1.2)
# elif 'Ratio' in x_label:
# ax.set_xlim(left=0, right=1.0)
# elif 'Percentage' in x_label:
# ax.set_xlim(left=0, right=100.0)
# elif 'Wage' in x_label:
# ax.set_xlim(left=0, right=max(current_max_x * 1.2, 850))
# else:
# ax.set_xlim(left=0, right=current_max_x * 1.2)
# else:
# ax.set_title(f"No meaningful data for {title.split('by ')[1]}", fontsize=16)
# ax.text(0.5, 0.5, 'Data not available or all zeros', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, fontsize=14, color='red')
# ax.set_visible(True)
# ax.set_xticks([])
# ax.set_yticks([])
# # Plot 1: States by Total Expenditure (₹)
# plot_state_bar_chart_with_values(axes[0, 0], state_performance,
# 'Total_Expenditure', 'State',
# 'States by Total Expenditure (₹)', 'Total Expenditure', 'State',
# lambda x, _: f'{x/1e7:.1f} Cr', 'viridis', ascending=True)
# # Comment for Plot 1
# """
# Purpose: To visualize the total financial investment in MGNREGA across all states.
# Insights: Identifies states with the largest and smallest overall expenditure.
# Interpretation: Highlights states with substantial program scale. Large disparities suggest varying levels of program implementation or regional demands for work. The bar labels provide exact figures, aiding precise comparison.
# """
# # Plot 2: States by Total Workers Employed
# plot_state_bar_chart_with_values(axes[0, 1], state_performance,
# 'Total_Workers', 'State', 'States by Total Workers Employed',
# 'Total Workers', 'State', lambda x, _: f'{x/1e6:.1f} M', 'magma', ascending=True)
# # Comment for Plot 2
# """
# Purpose: To visualize the total number of workers engaged in MGNREGA activities across all states.
# Insights: Shows the geographical reach of the scheme in terms of human resource engagement.
# Interpretation: High worker numbers are critical for livelihood support. These figures are cumulative over time, not unique individuals, indicating the volume of employment generated. Labels offer precise worker counts.
# """
# # Plot 3: States by Average Daily Wage (₹)
# plot_state_bar_chart_with_values(axes[1, 0], state_performance,
# 'Avg_Daily_Wage', 'State', 'States by Average Daily Wage (₹)',
# 'Average Daily Wage (₹)', 'State', palette='plasma', ascending=True)
# # Comment for Plot 3
# """
# Purpose: To compare the average daily wage rates offered by MGNREGA across different states.
# Insights: Reveals regional variations in compensation for work under the scheme.
# Interpretation: Differences in average wages can stem from variations in state-specific minimum wages or the nature of work (skilled vs. unskilled). Higher wages improve worker earnings, while lower wages might indicate areas needing policy review to ensure fair compensation.
# """
# # Plot 4: States by Average 100-Days HH Ratio
# plot_state_bar_chart_with_values(axes[1, 1], state_performance,
# 'Avg_100_Days_HH_Ratio', 'State', 'States by Average 100-Days HH Ratio',
# 'Average Ratio', 'State', palette='coolwarm', ascending=True)
# # Comment for Plot 4
# """
# Purpose: To assess the proportion of households completing the guaranteed 100 days of employment across states.
# Insights: A key indicator of the scheme's effectiveness in providing sustained livelihood support.
# Interpretation: Very low ratios across all states suggest a significant gap between the scheme's intent and its execution on the ground. States with higher ratios are more successful in this core objective. This gap could be due to demand-side issues (workers not seeking 100 days) or supply-side issues (lack of available work, administrative hurdles).
# """
# # Plot 5: States by Average Payment Efficiency (%)
# plot_state_bar_chart_with_values(axes[2, 0], state_performance,
# 'Avg_Payment_Efficiency', 'State', 'States by Average Payment Efficiency (%)',
# 'Average Percentage (%)', 'State', palette='OrRd', ascending=True)
# # Comment for Plot 5
# """
# Purpose: To visualize states' administrative efficiency in disbursing wages within the stipulated 15 days.
# Insights: Highlights states with high and low adherence to timely payment.
# Interpretation: High payment efficiency is crucial for worker welfare, reducing financial stress and increasing trust in the scheme. Low efficiency in some states indicates administrative bottlenecks that require urgent attention.
# """
# # Plot 6: States by Average Women Persondays Ratio
# plot_state_bar_chart_with_values(axes[2, 1], state_performance,
# 'Avg_Women_Persondays_Ratio', 'State', 'States by Average Women Persondays Ratio',
# 'Average Ratio', 'State', palette='viridis', ascending=True)
# # Comment for Plot 6
# """
# Purpose: To assess gender inclusivity and the proportion of employment generated for women across states.
# Insights: Reveals states that are more successful in mobilizing and providing work for women.
# Interpretation: Higher ratios signify better gender equity in MGNREGA implementation. This is a vital social equity metric, indicating successful efforts in empowering women in the rural workforce. Lower ratios might suggest cultural barriers or lack of suitable work for women in those regions.
# """
# plt.tight_layout(rect=[0, 0.03, 1, 0.96])
# plt.show()
# # --- Analysis by Time (Financial Year and Month) ---
# print("\n--- Trends Over Financial Years ---")
# # --- Deriving calendar_year and calendar_month on-the-fly for time series ---
# # This ensures we are not relying on a globally created 'Date' column and its potential issues
# # Updated month_order to reflect Indian financial year starting April
# financial_month_order_for_plot = ['April', 'May', 'June', 'July', 'August', 'September',
# 'October', 'November', 'December', 'January', 'February', 'March']
# short_to_full_month_map = {
# 'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
# 'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
# 'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
# }
# month_to_num_for_fy = {month: i for i, month in enumerate(financial_month_order_for_plot)}
# # Create a temporary DataFrame for time series analysis
# df_temp_time = df.copy() # df is already filtered now
# # Ensure 'month' column is clean before mapping
# df_temp_time['month_cleaned_for_temp'] = df_temp_time['month'].astype(str).str.strip().str.title()
# # Map short names to full names, and then get numerical order
# df_temp_time['month_full_name'] = df_temp_time['month_cleaned_for_temp'].map(short_to_full_month_map).fillna(df_temp_time['month_cleaned_for_temp'])
# df_temp_time['month_num_fy_order'] = df_temp_time['month_full_name'].map(month_to_num_for_fy)
# df_temp_time['start_year_temp'] = df_temp_time['fin_year'].astype(str).str.split('-').str[0].astype(int)
# # Corrected get_calendar_year_for_temp logic for Indian financial year (April-March)
# def get_calendar_year_for_temp(row):
# start_year = row['start_year_temp']
# month_num_fy_order = row['month_num_fy_order'] # Using the order number for current year determination
# if pd.isna(start_year) or pd.isna(month_num_fy_order):
# return np.nan
# # If month is April-Dec (0-8 in financial_month_order_for_plot index), it's the start year
# # If month is Jan-Mar (9-11 in financial_month_order_for_plot index), it's start_year + 1
# if month_num_fy_order >= month_to_num_for_fy['April']: # Months from April to December (index 0 to 8)
# return int(start_year)
# else: # Months January to March (index 9 to 11)
# return int(start_year + 1)
# df_temp_time['calendar_year_temp'] = df_temp_time.apply(get_calendar_year_for_temp, axis=1)
# # Drop rows with NaNs in necessary temporal columns
# df_temp_time.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
# df_temp_time['calendar_year_temp'] = df_temp_time['calendar_year_temp'].astype(int)
# # Filter for yearly trends (no further filtering needed, df is already filtered)
# yearly_trends = df_temp_time.groupby('fin_year').agg(
# Total_Expenditure=('Total_Exp', 'sum'),
# Total_Workers=('Total_No_of_Workers', 'sum'),
# Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
# Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
# Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
# ).sort_values(by='fin_year')
# print("Yearly Trends (Operating on filtered data):")
# print(yearly_trends)
# # Refined Yearly Trends Plot: Using two separate line plots for clarity
# fig, ax1 = plt.subplots(figsize=(15, 8))
# # Plot Total Expenditure
# color = 'tab:blue'
# ax1.set_xlabel('Financial Year', fontsize=12)
# ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
# line1 = ax1.plot(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
# ax1.tick_params(axis='y', labelcolor=color)
# ax1.tick_params(axis='x', rotation=45)
# ax1.grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points for Total Expenditure
# for x, y in zip(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7):
# ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
# # Plot Total Workers on a twin axis
# ax2 = ax1.twinx()
# color = 'tab:orange'
# ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
# line2 = ax2.plot(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
# ax2.tick_params(axis='y', labelcolor=color)
# # Annotate points for Total Workers
# for x, y in zip(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6):
# ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
# fig.suptitle('Yearly Trends: Total Expenditure and Workers Employed (2018-2024)', fontsize=16) # Updated title
# # Add a combined legend from both axes
# lines_combined = line1 + line2
# labels_combined = [l.get_label() for l in lines_combined]
# ax2.legend(lines_combined, labels_combined, loc='upper left', bbox_to_anchor=(0.05, 0.95))
# fig.tight_layout()
# plt.show()
# # Comment for Yearly Trends: Total Expenditure and Workers Employed
# """
# Purpose: To visualize the overall temporal evolution of financial investment and human resource utilization under MGNREGA for historically consistent years.
# Insights: Reveals major spikes or dips in activity for comparable periods. Excludes 2024-2025 and 2025-2026 for accurate trend representation due to data granularity and completeness issues.
# Interpretation: A generally increasing trend over years indicates program expansion or increased demand. Consistent trend analysis is now possible. Scaling to crores and millions helps in readability for large numbers. Annotated values provide precise figures for each year.
# """
# # Separate plots for other yearly trends for clarity
# fig, axes = plt.subplots(1, 3, figsize=(20, 6))
# fig.suptitle('Yearly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024)', fontsize=16) # Updated title
# # Plot Avg_Daily_Wage by Year
# sns.lineplot(x=yearly_trends.index, y='Avg_Daily_Wage', data=yearly_trends, marker='o', ax=axes[0], color='green')
# axes[0].set_title('Average Daily Wage Rate by Year', fontsize=14)
# axes[0].set_xlabel('Financial Year', fontsize=12)
# axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
# axes[0].tick_params(axis='x', rotation=45)
# axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(yearly_trends.index, yearly_trends['Avg_Daily_Wage']):
# axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# # Comment for Plot (Avg_Daily_Wage)
# """
# Purpose: To track the trend of average daily wages provided under MGNREGA over time for consistent years.
# Insights: Shows whether worker income is increasing or stagnating.
# Interpretation: A consistent upward trend is a positive sign for improving livelihood standards under the scheme. Any stagnation or decrease would warrant concern and policy review. Excludes anomalous years. Annotated values provide precise figures.
# """
# # Plot Avg_100_Days_HH_Ratio by Year
# sns.lineplot(x=yearly_trends.index, y='Avg_100_Days_HH_Ratio', data=yearly_trends, marker='o', ax=axes[1], color='purple')
# axes[1].set_title('Average 100-Days HH Ratio by Year', fontsize=14)
# axes[1].set_xlabel('Financial Year', fontsize=12)
# axes[1].set_ylabel('Average Ratio', fontsize=12)
# axes[1].tick_params(axis='x', rotation=45)
# axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(yearly_trends.index, yearly_trends['Avg_100_Days_HH_Ratio']):
# axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# # Comment for Plot (Avg_100_Days_HH_Ratio)
# """
# Purpose: To understand the longitudinal effectiveness of providing the full 100 days of guaranteed work for consistent years.
# Insights: Reveals the scheme's success in delivering its core promise. Low or declining trends highlight challenges.
# Interpretation: Observed trends in this ratio are critical for policy attention. Possible reasons for fluctuations include demand for work, administrative bottlenecks, or policy shifts. Annotated values provide precise figures.
# """
# # Plot Avg_Payment_Efficiency by Year
# sns.lineplot(x=yearly_trends.index, y='Avg_Payment_Efficiency', data=yearly_trends, marker='o', ax=axes[2], color='brown')
# axes[2].set_title('Average Payment Efficiency by Year', fontsize=14)
# axes[2].set_xlabel('Financial Year', fontsize=12)
# axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
# axes[2].tick_params(axis='x', rotation=45)
# axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(yearly_trends.index, yearly_trends['Avg_Payment_Efficiency']):
# axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# # Comment for Plot (Avg_Payment_Efficiency)
# """
# Purpose: To monitor the efficiency of wage disbursement over financial years for consistent years.
# Insights: Timely payments are crucial for worker welfare. A high and stable percentage is ideal.
# Interpretation: Observed efficiency trends are critical for understanding administrative performance. Any dips warrant investigation, as payment delays can severely impact the financial stability of workers. Annotated values provide precise figures.
# """
# plt.tight_layout(rect=[0, 0.03, 1, 0.96])
# plt.show()
# # Monthly trends (considering all years for seasonality)
# # df is already filtered, so df_temp_time will also be filtered
# df_filtered_for_monthly = df_temp_time.copy()
# df_filtered_for_monthly.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
# # Debugging the content of df_filtered_for_monthly just before monthly aggregation
# print("\n--- Debugging df_filtered_for_monthly before Monthly Aggregation ---")
# print(f"Shape: {df_filtered_for_monthly.shape}")
# print(f"month_num_fy_order nulls: {df_filtered_for_monthly['month_num_fy_order'].isnull().sum()}")
# print(f"calendar_year_temp nulls: {df_filtered_for_monthly['calendar_year_temp'].isnull().sum()}")
# print(f"Financial years present: {df_filtered_for_monthly['fin_year'].unique()}")
# print(f"Sample of data for monthly trends:\n{df_filtered_for_monthly[['fin_year', 'month', 'Total_Exp', 'Total_No_of_Workers', 'Average_Wage_rate_per_day_per_person']].head()}")
# print(f"Sum of Total_Exp in df_filtered_for_monthly: {df_filtered_for_monthly['Total_Exp'].sum()}")
# print(f"Sum of Total_No_of_Workers in df_filtered_for_monthly: {df_filtered_for_monthly['Total_No_of_Workers'].sum()}")
# df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['month'].astype(str).str.strip().str.title().astype('category')
# # Updated month_order to reflect Indian financial year starting April
# month_order_financial_year = ['April', 'May', 'June', 'July', 'August', 'September',
# 'October', 'November', 'December', 'January', 'February', 'March']
# # Re-map short month names to full for consistent ordering and mapping
# short_to_full_month = {
# 'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
# 'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
# 'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
# }
# df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['calendar_month'].map(short_to_full_month).fillna(df_filtered_for_monthly['calendar_month']) # Map short to full, keep full as is
# df_filtered_for_monthly['calendar_month'] = pd.Categorical(df_filtered_for_monthly['calendar_month'], categories=month_order_financial_year, ordered=True) # Use new financial year month order
# monthly_trends = df_filtered_for_monthly.groupby('calendar_month').agg(
# Total_Expenditure=('Total_Exp', 'sum'),
# Total_Workers=('Total_No_of_Workers', 'sum'),
# Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
# Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
# Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
# ).reindex(month_order_financial_year) # Order months for consistent plotting
# monthly_trends = monthly_trends.fillna(0)
# print("\n--- Monthly Trends (Aggregated Across All Years, for 2018-2024 FYs) ---") # Updated title
# print(monthly_trends)
# print("\n--- Monthly Trends Describe ---")
# print(monthly_trends.describe())
# fig, ax1 = plt.subplots(figsize=(15, 8))
# color = 'tab:blue'
# ax1.set_xlabel('Month', fontsize=12)
# ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
# line1_monthly = ax1.plot(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
# ax1.tick_params(axis='y', labelcolor=color)
# ax1.tick_params(axis='x', rotation=45)
# ax1.grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points for Total Expenditure (Monthly)
# for x, y in zip(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7):
# ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
# ax2 = ax1.twinx()
# color = 'tab:orange'
# ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
# line2_monthly = ax2.plot(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
# ax2.tick_params(axis='y', labelcolor=color)
# # Annotate points for Total Workers (Monthly)
# for x, y in zip(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6):
# ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
# fig.suptitle('Monthly Trends: Total Expenditure and Workers Employed (2018-2024 FYs)', fontsize=16) # Updated title
# lines_combined_monthly = line1_monthly + line2_monthly
# labels_combined_monthly = [l.get_label() for l in lines_combined_monthly]
# ax2.legend(lines_combined_monthly, labels_combined_monthly, loc='upper left', bbox_to_anchor=(0.05, 0.95))
# fig.tight_layout()
# plt.show()
# # Comment for Monthly Trends: Total Expenditure and Workers Employed
# """
# Purpose: To identify seasonal patterns in MGNREGA expenditure and worker engagement, aligned with the Indian financial year, using historically consistent data (2018-2024 FYs).
# Insights: Reveals peak and lean periods of activity within the financial year.
# Interpretation: Activity often aligns with agricultural cycles (e.g., lower demand during peak agricultural seasons, higher during lean periods). Understanding these cycles is crucial for efficient resource planning and matching work availability with demand. This plot aggregates data across all complete financial years. Annotated values provide precise figures.
# """
# fig, axes = plt.subplots(1, 3, figsize=(20, 6))
# fig.suptitle('Monthly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024 FYs)', fontsize=16) # Updated title
# # Plot Avg_Daily_Wage by Month
# sns.lineplot(x=monthly_trends.index, y='Avg_Daily_Wage', data=monthly_trends, marker='o', ax=axes[0], color='green')
# axes[0].set_title('Average Daily Wage Rate by Month', fontsize=14)
# axes[0].set_xlabel('Month', fontsize=12)
# axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
# axes[0].tick_params(axis='x', rotation=45)
# axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(monthly_trends.index, monthly_trends['Avg_Daily_Wage']):
# axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# # Comment for Plot (Avg_Daily_Wage by Month)
# """
# Purpose: To analyze monthly variations in average daily wage rates, aligned with the Indian financial year, for consistent historical data.
# Insights: Can show if wages fluctuate seasonally or are relatively stable throughout the year.
# Interpretation: Significant monthly fluctuations might indicate changes in work types or local economic conditions. Relative stability, especially near the median, suggests consistent wage policy application. Annotated values provide precise figures.
# """
# # Plot Avg_100_Days_HH_Ratio by Month
# sns.lineplot(x=monthly_trends.index, y='Avg_100_Days_HH_Ratio', data=monthly_trends, marker='o', ax=axes[1], color='purple')
# axes[1].set_title('Average 100-Days HH Ratio by Month', fontsize=14)
# axes[1].set_xlabel('Month', fontsize=12)
# axes[1].set_ylabel('Average Ratio', fontsize=12)
# axes[1].tick_params(axis='x', rotation=45)
# axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(monthly_trends.index, monthly_trends['Avg_100_Days_HH_Ratio']):
# axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# # Comment for Plot (Avg_100_Days_HH_Ratio by Month)
# """
# Purpose: To identify monthly patterns in the provision of 100 days of employment, aligned with the Indian financial year, for consistent historical data.
# Insights: Reveals if certain months are better or worse for achieving this key scheme objective.
# Interpretation: Seasonal peaks might align with lean agricultural seasons when demand for work is highest. Understanding these patterns can help target interventions. Annotated values provide precise figures.
# """
# # Plot Avg_Payment_Efficiency by Month
# sns.lineplot(x=monthly_trends.index, y='Avg_Payment_Efficiency', data=monthly_trends, marker='o', ax=axes[2], color='brown')
# axes[2].set_title('Average Payment Efficiency by Month', fontsize=14)
# axes[2].set_xlabel('Month', fontsize=12)
# axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
# axes[2].tick_params(axis='x', rotation=45)
# axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# # Annotate points
# for x, y in zip(monthly_trends.index, monthly_trends['Avg_Payment_Efficiency']):
# axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# # Comment for Plot (Avg_Payment_Efficiency by Month)
# """
# Purpose: To assess monthly consistency in payment efficiency, aligned with the Indian financial year, for consistent historical data.
# Insights: Can highlight months where administrative bottlenecks or external factors might cause payment delays.
# Interpretation: Stable high efficiency indicates a robust system. Dips can signal areas where the payment mechanism needs strengthening, especially if it coincides with peak work demand. Annotated values provide precise figures.
# """
# plt.tight_layout(rect=[0, 0.03, 1, 0.96])
# plt.show()
# # --- New Visualization: Monthly Trends for Key Metrics by Financial Year ---
# print("\n--- Monthly Trends by Financial Year for Key Metrics (2018-2024 FYs) ---") # Updated title
# # Data preparation for the new plot: Group by fin_year and calendar_month
# df_temp_time['calendar_month_for_plot'] = df_temp_time['month'].astype(str).str.strip().str.title().astype('category')
# df_temp_time['calendar_month_for_plot'] = df_temp_time['calendar_month_for_plot'].map(short_to_full_month).fillna(df_temp_time['calendar_month_for_plot'])
# df_temp_time['calendar_month_for_plot'] = pd.Categorical(df_temp_time['calendar_month_for_plot'], categories=financial_month_order_for_plot, ordered=True) # Use financial year month order
# # Filter out the incomplete 2025-2026 data (already handled by df filtering)
# monthly_yearly_trends_filtered = df_temp_time.groupby(['fin_year', 'calendar_month_for_plot']).agg(
# Total_Expenditure=('Total_Exp', 'sum'),
# Total_Workers=('Total_No_of_Workers', 'sum'),
# Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
# Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
# Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
# ).reset_index().rename(columns={'calendar_month_for_plot': 'Month'})
# # Fill NaNs only for numerical columns in this aggregated DataFrame
# numerical_cols_for_fill = ['Total_Expenditure', 'Total_Workers', 'Avg_Daily_Wage', 'Avg_100_Days_HH_Ratio', 'Avg_Payment_Efficiency']
# for col in numerical_cols_for_fill:
# if col in monthly_yearly_trends_filtered.columns:
# monthly_yearly_trends_filtered[col] = monthly_yearly_trends_filtered[col].fillna(0)
# # Plotting for Total Expenditure by Month for Each Year
# plt.figure(figsize=(18, 10))
# lineplot_exp = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Expenditure', hue='fin_year', marker='o', palette='tab10')
# plt.title('Monthly Total Expenditure (Crores ₹) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
# plt.xlabel('Month', fontsize=12)
# plt.ylabel('Total Expenditure (Crores ₹)', fontsize=12)
# plt.tick_params(axis='x', rotation=45)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# # Annotate points for Monthly Total Expenditure
# for line in lineplot_exp.get_lines():
# for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
# if pd.notna(y_val) and y_val != 0: # Only annotate non-NaN, non-zero values
# plt.text(x_val, y_val, f'{y_val/1e7:.1f} Cr', fontsize=7, ha='center', va='bottom', color=line.get_color()) # Adjusted formatting
# plt.tight_layout()
# plt.show()
# # Comment for Monthly Total Expenditure by Financial Year
# """
# Purpose: To visualize the month-wise trends of total expenditure for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
# Insights: Helps in identifying year-specific anomalies or deviations from typical seasonal patterns. Reveals if program scale increased in certain months in specific years.
# Interpretation: For example, a sudden spike in expenditure during a particular month in one year might correspond to specific project rollouts or disaster relief efforts. This granular view helps in understanding yearly operational nuances.
# """
# # Plotting for Total Workers by Month for Each Year
# plt.figure(figsize=(18, 10))
# lineplot_workers = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Workers', hue='fin_year', marker='x', palette='tab10')
# plt.title('Monthly Total Workers (Millions) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
# plt.xlabel('Month', fontsize=12)
# plt.ylabel('Total Workers (Millions)', fontsize=12)
# plt.tick_params(axis='x', rotation=45)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# # Annotate points for Monthly Total Workers
# for line in lineplot_workers.get_lines():
# for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
# if pd.notna(y_val) and y_val != 0: # Only annotate non-NaN, non-zero values
# plt.text(x_val, y_val, f'{y_val/1e6:.1f} M', fontsize=7, ha='center', va='bottom', color=line.get_color()) # Adjusted formatting
# plt.tight_layout()
# plt.show()
# # Comment for Monthly Total Workers by Financial Year
# """
# Purpose: To visualize the month-wise trends of total workers employed for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
# Insights: Helps in identifying seasonal employment patterns and how they might vary year-on-year.
# Interpretation: Shows how demand for work or availability of projects fluctuates monthly within each financial year. This is vital for understanding employment stability and planning resource allocation.
# """
# # --- Geospatial Analysis (Choropleth Maps) ---
# print("\n--- Starting Geospatial Analysis ---")
# # Load India's state boundaries
# try:
# # IMPORTANT: VERIFY YOUR GEOJSON FILE PATH AND FILENAME!
# if not os.path.exists(GEOJSON_FILE):
# raise FileNotFoundError(f"GeoJSON file not found at: {GEOJSON_FILE}")
# india_states_gdf = geopandas.read_file(GEOJSON_FILE)
# print(f"Loaded GeoJSON from: {GEOJSON_FILE}")
# print(f"GeoDataFrame columns: {india_states_gdf.columns.tolist()}")
# # Check if the loaded GeoJSON is state-level or district-level
# is_state_level_geojson = 'NAME_1' in india_states_gdf.columns and 'NAME_2' not in india_states_gdf.columns
# if not is_state_level_geojson:
# print("Warning: Loaded GeoJSON appears to be district-level or different structure. Attempting to dissolve to state level.")
# if 'NAME_1' in india_states_gdf.columns:
# # Dissolve by NAME_1 to get state boundaries from a district-level file
# india_states_gdf['State_Clean_for_Dissolve'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# india_states_gdf = india_states_gdf.dissolve(by='State_Clean_for_Dissolve')
# # After dissolve, the 'NAME_1' column might become the index or a regular column depending on geopandas version
# if 'State_Clean_for_Dissolve' in india_states_gdf.index.names:
# india_states_gdf = india_states_gdf.reset_index()
# print("Successfully dissolved district-level GeoJSON to state boundaries.")
# else:
# print("Error: Cannot dissolve GeoJSON to state level. 'NAME_1' column not found. Map plotting might fail or be incorrect.")
# india_states_gdf['State_Clean'] = ''
# # IMPORTANT: Consistent state name cleaning for both GeoDataFrame and aggregated data
# # Standardize names: remove spaces, replace '&' with 'AND', remove dots.
# if 'NAME_1' in india_states_gdf.columns:
# india_states_gdf['State_Clean'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# elif 'State_Clean_for_Dissolve' in india_states_gdf.columns:
# india_states_gdf['State_Clean'] = india_states_gdf['State_Clean_for_Dissolve'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# elif 'NAME_LOCAL' in india_states_gdf.columns:
# india_states_gdf['State_Clean'] = india_states_gdf['NAME_LOCAL'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# else:
# print("Warning: Could not find standard state name column in GeoJSON for final cleaning. Merge might fail.")
# india_states_gdf['State_Clean'] = india_states_gdf.index.astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# state_performance_reset = state_performance.reset_index()
# state_performance_reset['State_Clean'] = state_performance_reset['State'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# # Debugging state name mismatches
# mgnrega_states = set(state_performance_reset['State_Clean'].unique())
# geojson_states = set(india_states_gdf['State_Clean'].unique())
# print(f"\nStates in MGNREGA data: {sorted(list(mgnrega_states))}")
# print(f"States in GeoJSON data: {sorted(list(geojson_states))}")
# print(f"States in MGNREGA but not in GeoJSON: {mgnrega_states - geojson_states}")
# print(f"States in GeoJSON but not in MGNREGA: {geojson_states - mgnrega_states}")
# common_states = list(mgnrega_states.intersection(geojson_states))
# if not common_states:
# print("Warning: No common states found between MGNREGA data and GeoJSON. Map plotting might be empty or incorrect.")
# print("This is often due to naming inconsistencies. Please review the 'States in MGNREGA data' and 'States in GeoJSON data' lists above.")
# merged_gdf = india_states_gdf.merge(state_performance_reset, left_on='State_Clean', right_on='State_Clean', how='left')
# for col in state_performance.columns:
# if col in merged_gdf.columns:
# if merged_gdf[col].isnull().any():
# merged_gdf[col] = merged_gdf[col].fillna(0)
# # Plot Choropleth Maps for key metrics
# fig, axes = plt.subplots(3, 2, figsize=(20, 25))
# fig.suptitle('Geographical Distribution of Key MGNREGA Performance Indicators by State', fontsize=20)
# map_metrics = [
# ('Total_Expenditure', 'Total Expenditure (Crores ₹)', plt.cm.viridis),
# ('Total_Workers', 'Total Workers (Millions)', plt.cm.magma),
# ('Avg_Daily_Wage', 'Average Daily Wage (₹)', plt.cm.plasma),
# ('Avg_100_Days_HH_Ratio', 'Average 100-Days HH Ratio', plt.cm.coolwarm),
# ('Avg_Payment_Efficiency', 'Average Payment Efficiency (%)', plt.cm.OrRd),
# ('Avg_Women_Persondays_Ratio', 'Average Women Persondays Ratio', plt.cm.viridis)
# ]
# axes = axes.flatten()
# for i, (metric, title, cmap) in enumerate(map_metrics):
# ax = axes[i]
# plot_data = merged_gdf[metric]
# if 'Expenditure' in title:
# plot_data = plot_data / 1e7
# elif 'Workers' in title:
# plot_data = plot_data / 1e6
# vmin, vmax = None, None
# if 'Ratio' in title or 'Percentage' in title:
# vmin = 0
# vmax = 1.0 if 'Ratio' in title else 100.0
# # Ensure plot_data has non-NaN values for mapping, fill with 0 if needed for states with no data
# plot_data_for_map = plot_data.fillna(0) # Temporarily fill for plotting color, missing_kwds still takes over for legend
# merged_gdf.plot(column=plot_data_for_map, cmap=cmap, linewidth=0.8, ax=ax, edgecolor='0.8', legend=True,
# legend_kwds={'label': title, 'orientation': "horizontal", 'shrink': 0.7},
# missing_kwds={"color": "lightgrey", "label": "No Data"},
# vmin=vmin, vmax=vmax
# )
# ax.set_title(title, fontsize=16)
# ax.set_axis_off()
# # Add state names as text labels (selectively for larger states for clarity)
# # Use merged_gdf['State'] for original state names if available
# for idx, row in merged_gdf.iterrows():
# # Only label states that have data and are large enough for the label to be clear
# # Filter for larger states or states with more central geometry to avoid clutter
# if pd.notna(row[metric]) and row[metric] > 0 and row['State_Clean'] in common_states:
# # Heuristic for selective labeling on India map to prevent clutter
# if row['State_Clean'] in ['UTTARPRADESH', 'MADHYAPRADESH', 'RAJASTHAN', 'MAHARASHTRA',
# 'ANDHRAPRADESH', 'KARNATAKA', 'GUJARAT', 'ODISHA',
# 'CHHATTISGARH', 'WESTBENGAL', 'BIHAR', 'ASSAM', 'KERALA',
# 'PUNJAB', 'HARYANA', 'JHARKHAND', 'UTTARAKHAND']:
# try:
# centroid = row.geometry.centroid
# # Adjust text properties based on expected background color from colormap or general contrast
# # For viridis/magma/plasma, black text is generally good. For OrRd, black/dark color needed.
# ax.text(centroid.x, centroid.y, row['State'], fontsize=7, ha='center', va='center', color='black', alpha=0.9,
# bbox=dict(boxstyle="round,pad=0.1", fc='white', ec='none', alpha=0.5)) # Add bbox for readability
# except Exception as e:
# print(f"Could not add label for {row['State']}: {e}")
# plt.tight_layout(rect=[0, 0.03, 1, 0.98])
# plt.show()
# print("\n--- Geospatial Analysis Completed ---")
# except FileNotFoundError:
# print(f"Error: GeoJSON file '{GEOJSON_FILE}' not found. Skipping geospatial analysis.")
# print("Please ensure you have the GeoJSON file for Indian administrative boundaries in the specified path.")
# print("Example: ML_CA1/Datasets/GeoJSON/gadm41_IND_1.json (for states) or gadm41_IND_2.json (if you want to dissolve districts to states).")
# except Exception as e:
# print(f"An unexpected error occurred during geospatial analysis: {e}")
# print("Skipping geospatial analysis. Error details:")
# import traceback
# traceback.print_exc()
# # --- Bivariate Analysis: Correlation Matrix ---
# print("\n--- Correlation Matrix of Key Numerical Metrics ---")
# correlation_cols = [
# 'Approved_Labour_Budget', 'Total_Exp', 'Wages', 'Persondays_of_Central_Liability_so_far',
# 'Total_No_of_Workers', 'Total_Households_Worked', 'Total_Individuals_Worked',
# 'Average_Wage_rate_per_day_per_person', 'percentage_payments_gererated_within_15_days',
# 'Women_Persondays_Ratio', 'SC_Persondays_Ratio', 'ST_Persondays_Ratio',
# '100_Days_HH_Ratio', 'percent_of_NRM_Expenditure', 'percent_of_Expenditure_on_Agriculture_Allied_Works'
# ]
# correlation_matrix = df[correlation_cols].corr()
# plt.figure(figsize=(14, 10))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
# plt.title('Correlation Matrix of Key MGNREGA Metrics', fontsize=16)
# plt.xticks(rotation=45, ha='right')
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()
# # Comment for Correlation Matrix
# """
# Purpose: To quantitatively assess the linear relationships between various key numerical metrics.
# Insights: High correlation (close to +1 or -1) indicates that two variables tend to move together or in opposite directions.
# Interpretation: For instance, a strong positive correlation between 'Total_Exp' and 'Total_Workers' would indicate that higher spending leads to more employment. This helps identify interdependencies and potential drivers for different aspects of the scheme. Weak correlations might suggest that metrics are independent, or that non-linear relationships exist.
# """
# print("\n--- EDA Step 3 Completed. Dataset is ready for in-depth analysis and insights. ---")
In [ ]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas # Import geopandas for geospatial analysis
from matplotlib.offsetbox import AnnotationBbox, OffsetImage # For image annotations on maps
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# --- Configuration for file paths ---
PROJECT_ROOT = ""
CLEANED_DATA_DIR = os.path.join(PROJECT_ROOT, "Datasets", "Cleaned_Preprocessed")
FULLY_CLEANED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_fully_cleaned.csv")
# IMPORTANT: Ensure this GeoJSON file exists in your project structure:
# ML_CA1/Datasets/GeoJSON/gadm41_IND_1.json (for states) is preferred.
# If you use gadm41_IND_2.json (district level), we will dissolve it to state level.
GEOJSON_FILE = os.path.join(PROJECT_ROOT, "Datasets", "GEOJSON", "gadm41_IND_1.json") # Prefer state-level GeoJSON
# Define the path for the final cleaned, sorted, and FILTERED CSV
FINAL_FILTERED_SORTED_CSV_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_filtered_for_analysis.csv")
# Load the fully cleaned dataset with robust data handling
try:
df = pd.read_csv(FULLY_CLEANED_CSV_FILE)
# --- Robust Numerical Column Re-conversion AFTER CSV Load ---
# Ensure all numerical columns are correctly typed and handle inf/NaN.
numerical_cols_to_reconvert = [
'Approved_Labour_Budget', 'Average_Wage_rate_per_day_per_person',
'Average_days_of_employment_provided_per_Household', 'Differently_abled_persons_worked',
'Material_and_skilled_Wages', 'Number_of_Completed_Works',
'Number_of_GPs_with_NIL_exp', 'Number_of_Ongoing_Works',
'Persondays_of_Central_Liability_so_far', 'SC_persondays',
'SC_workers_against_active_workers', 'ST_persondays',
'ST_workers_against_active_workers', 'Total_Adm_Expenditure', 'Total_Exp',
'Total_Households_Worked', 'Total_Individuals_Worked',
'Total_No_of_Active_Job_Cards', 'Total_No_of_Active_Workers',
'Total_No_of_HHs_completed_100_Days_of_Wage_Employment',
'Total_No_of_JobCards_issued', 'Total_No_of_Workers',
'Total_No_of_Works_Takenup', 'Wages', 'Women_Persondays',
'percent_of_Category_B_Works', 'percent_of_Expenditure_on_Agriculture_Allied_Works',
'percent_of_NRM_Expenditure', 'percentage_payments_gererated_within_15_days'
]
for col in numerical_cols_to_reconvert:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) # Handle inf/NaN with 0
df['state_code'] = df['state_code'].fillna(-1).astype(int)
df['district_code'] = df['district_code'].fillna(-1).astype(int)
# --- NO GLOBAL DATE COLUMN CREATION (as discussed, will derive as needed) ---
df['State'] = df['State'].astype('category')
df['District'] = df['District'].astype('category')
print(f"Loaded fully cleaned data from: {FULLY_CLEANED_CSV_FILE}")
print(f"DataFrame shape after robust numerical handling: {df.shape}")
# --- Filter out 2024-2025 and 2025-2026 for consistent historical analysis (NEW) ---
print("\n--- Filtering Data for Consistent Historical Analysis (Excluding 2024-2025 and 2025-2026) ---")
years_to_exclude = ['2024-2025', '2025-2026']
df = df[~df['fin_year'].isin(years_to_exclude)].copy() # Filter the main DataFrame
print(f"DataFrame shape after filtering {years_to_exclude}: {df.shape}")
print(f"Financial years remaining in data: {df['fin_year'].unique()}")
# --- Data Sorting ---
print("\n--- Sorting DataFrame by Financial Year, Month (Financial Order), State Code, and District Code ---")
# 1. Define month order for Indian Financial Year (April to March)
financial_month_order = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
# Create a more robust month_to_num mapping to handle both full and short month names
# This dictionary is mainly for converting to numerical keys for comparison/sorting if needed,
# but the primary sorting will leverage CategoricalDtype
month_to_num_for_plotting = { # Use a different name to avoid confusion with sorting key
'April': 0, 'May': 1, 'June': 2, 'July': 3, 'August': 4, 'September': 5,
'October': 6, 'November': 7, 'December': 8, 'January': 9, 'February': 10, 'March': 11,
'Apr': 0, 'May': 1, 'Jun': 2, 'Jul': 3, 'Aug': 4, 'Sep': 5,
'Oct': 6, 'Nov': 7, 'Dec': 8, 'Jan': 9, 'Feb': 10, 'Mar': 11
}
# 2. Create numerical sorting key for fin_year
df['fin_year_sort_key'] = df['fin_year'].astype(str).str.split('-').str[0].astype(int)
# 3. Process 'month' column and convert to an ORDERED CategoricalDtype
df['month_processed'] = df['month'].astype(str).str.strip().str.title()
df['month_full_name_for_sort'] = df['month_processed'].apply(
lambda x: next((m_full for m_full in financial_month_order if m_full.startswith(x)), x)
)
df['month_ordered'] = pd.Categorical(df['month_full_name_for_sort'], categories=financial_month_order, ordered=True)
# 4. Sort the DataFrame using the custom ordered 'month_ordered'
df.sort_values(by=['fin_year_sort_key', 'month_ordered', 'state_code', 'district_code'], inplace=True)
# 5. Drop temporary sorting keys
df.drop(columns=['fin_year_sort_key', 'month_processed', 'month_full_name_for_sort', 'month_ordered'], inplace=True, errors='ignore')
print("DataFrame sorted successfully.")
print(df[['fin_year', 'month', 'state_code', 'State', 'district_code', 'District']].head(20))
# --- Save the fully cleaned, sorted, and FILTERED DataFrame (UPDATED PATH) ---
df.to_csv(FINAL_FILTERED_SORTED_CSV_FILE, index=False)
print(f"\nFiltered data for analysis saved to: {FINAL_FILTERED_SORTED_CSV_FILE}")
except FileNotFoundError:
print(f"Error: The file '{FULLY_CLEANED_CSV_FILE}' was not found. Please ensure the path is correct.")
exit()
except Exception as e:
print(f"An unexpected error occurred while loading the dataset: {e}")
exit()
print("\n--- Starting EDA Step 3: Refined Visualizations and Deep Dive into Trends and Patterns ---")
# --- Feature Engineering for deeper insights (re-run to ensure consistency) ---
df['Women_Persondays_Ratio'] = df['Women_Persondays'] / df['Persondays_of_Central_Liability_so_far']
df['Women_Persondays_Ratio'] = df['Women_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['Women_Persondays_Ratio'] = np.clip(df['Women_Persondays_Ratio'], 0, 1)
df['SC_Persondays_Ratio'] = df['SC_persondays'] / df['Persondays_of_Central_Liability_so_far']
df['SC_Persondays_Ratio'] = df['SC_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['SC_Persondays_Ratio'] = np.clip(df['SC_Persondays_Ratio'], 0, 1)
df['ST_Persondays_Ratio'] = df['ST_persondays'] / df['Persondays_of_Central_Liability_so_far']
df['ST_Persondays_Ratio'] = df['ST_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['ST_Persondays_Ratio'] = np.clip(df['ST_Persondays_Ratio'], 0, 1)
df['100_Days_HH_Ratio'] = df['Total_No_of_HHs_completed_100_Days_of_Wage_Employment'] / df['Total_Households_Worked']
df['100_Days_HH_Ratio'] = df['100_Days_HH_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['100_Days_HH_Ratio'] = np.clip(df['100_Days_HH_Ratio'], 0, 1)
print("\n--- Feature Engineering Completed ---")
# --- Analysis by State (Visualizations with refined plots and explicit hue) ---
print("\n--- State-wise Aggregated Metrics ---")
# Aggregate all state performance metrics
state_performance = df.groupby('State').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean'),
Total_Women_Persondays=('Women_Persondays', 'sum'),
Avg_Women_Persondays_Ratio=('Women_Persondays_Ratio', 'mean'),
Avg_NRM_Exp_Percent=('percent_of_NRM_Expenditure', 'mean')
)
print("\n--- State Performance Aggregated Data (Head) ---")
print(state_performance.head())
print("\n--- State Performance Aggregated Data (Describe) ---")
print(state_performance.describe())
# Visualizations: Bar Charts for State Performance - Now showing all states, sorted by value, with value labels
fig, axes = plt.subplots(3, 2, figsize=(20, 25))
fig.suptitle('State-wise Key MGNREGA Performance Indicators', fontsize=22) # Changed title
plt.subplots_adjust(left=0.1, right=0.9, top=0.92, bottom=0.08, hspace=0.4, wspace=0.3)
# Helper function to plot bar charts for state performance with values
def plot_state_bar_chart_with_values(ax, data, x_col, y_col, title, x_label, y_label, formatter=None, palette='viridis', ascending=False):
# Sort data for plotting (highest values at the top)
sorted_data = data.sort_values(by=x_col, ascending=ascending).reset_index()
if not sorted_data.empty and not sorted_data[x_col].isnull().all() and not (sorted_data[x_col] == 0).all():
bars = sns.barplot(x=x_col, y=y_col, data=sorted_data, ax=ax, palette=palette)
ax.set_title(title, fontsize=16)
ax.set_xlabel(x_label, fontsize=13)
ax.set_ylabel(y_label, fontsize=13)
if formatter:
ax.xaxis.set_major_formatter(plt.FuncFormatter(formatter))
ax.tick_params(axis='x', labelsize=11)
ax.tick_params(axis='y', labelsize=11)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Annotate bars with values
for container in ax.containers: # Iterate through containers for bar_label
for bar in container.patches: # Iterate through individual bars
x_val = bar.get_width()
y_val = bar.get_y() + bar.get_height() / 2
# Determine text color based on background luminance (simple heuristic)
text_color = 'black' # Default text color
if formatter: # Use formatted value to check
formatted_val = formatter(x_val, None)
if 'Cr' in formatted_val or 'M' in formatted_val:
# For large monetary/worker values, assume darker colors or high variance, use white
if x_val > ax.get_xlim()[1] * 0.5: # If bar is past half way, use white
text_color = 'white'
elif 'Ratio' in formatted_val or '%' in formatted_val:
# For percentages/ratios, if value is high, bar is dark, use white
if x_val > (ax.get_xlim()[1] * 0.6 if ax.get_xlim()[1] > 1 else ax.get_xlim()[1] * 0.6): # Adjusted for ratio scale
text_color = 'white'
ax.text(x_val, y_val, formatted_val if formatter else f'{x_val:.0f}',
color=text_color, ha='right', va='center', fontsize=8, weight='bold',
bbox=dict(boxstyle="round,pad=0.1", fc='white', ec='none', alpha=0.6)) # Add bbox for readability, slightly transparent
# --- Dynamic xlim adjustment for better visibility ---
if sorted_data[x_col].max() > 0:
current_max_x = sorted_data[x_col].max()
if 'Cr' in x_label:
ax.set_xlim(left=0, right=current_max_x * 1.2) # Increased right margin for labels
elif 'M' in x_label:
ax.set_xlim(left=0, right=current_max_x * 1.2)
elif 'Ratio' in x_label:
ax.set_xlim(left=0, right=1.0)
elif 'Percentage' in x_label:
ax.set_xlim(left=0, right=100.0)
elif 'Wage' in x_label:
ax.set_xlim(left=0, right=max(current_max_x * 1.2, 850))
else:
ax.set_xlim(left=0, right=current_max_x * 1.2)
else:
ax.set_title(f"No meaningful data for {title.split('by ')[1]}", fontsize=16)
ax.text(0.5, 0.5, 'Data not available or all zeros', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, fontsize=14, color='red')
ax.set_visible(True)
ax.set_xticks([])
ax.set_yticks([])
# Plot 1: States by Total Expenditure (₹)
plot_state_bar_chart_with_values(axes[0, 0], state_performance,
'Total_Expenditure', 'State',
'States by Total Expenditure (₹)', 'Total Expenditure', 'State',
lambda x, _: f'{x/1e7:.1f} Cr', 'viridis', ascending=True)
# Comment for Plot 1
"""
Purpose: To visualize the total financial investment in MGNREGA across all states.
Insights: Identifies states with the largest and smallest overall expenditure.
Interpretation: Highlights states with substantial program scale. Large disparities suggest varying levels of program implementation or regional demands for work. The bar labels provide exact figures, aiding precise comparison.
"""
# Plot 2: States by Total Workers Employed
plot_state_bar_chart_with_values(axes[0, 1], state_performance,
'Total_Workers', 'State', 'States by Total Workers Employed',
'Total Workers', 'State', lambda x, _: f'{x/1e6:.1f} M', 'magma', ascending=True)
# Comment for Plot 2
"""
Purpose: To visualize the total number of workers engaged in MGNREGA activities across all states.
Insights: Shows the geographical reach of the scheme in terms of human resource engagement.
Interpretation: High worker numbers are critical for livelihood support. These figures are cumulative over time, not unique individuals, indicating the volume of employment generated. Labels offer precise worker counts.
"""
# Plot 3: States by Average Daily Wage (₹)
plot_state_bar_chart_with_values(axes[1, 0], state_performance,
'Avg_Daily_Wage', 'State', 'States by Average Daily Wage (₹)',
'Average Daily Wage (₹)', 'State', palette='plasma', ascending=True)
# Comment for Plot 3
"""
Purpose: To compare the average daily wage rates offered by MGNREGA across different states.
Insights: Reveals regional variations in compensation for work under the scheme.
Interpretation: Differences in average wages can stem from variations in state-specific minimum wages or the nature of work (skilled vs. unskilled). Higher wages improve worker earnings, while lower wages might indicate areas needing policy review to ensure fair compensation.
"""
# Plot 4: States by Average 100-Days HH Ratio
plot_state_bar_chart_with_values(axes[1, 1], state_performance,
'Avg_100_Days_HH_Ratio', 'State', 'States by Average 100-Days HH Ratio',
'Average Ratio', 'State', palette='coolwarm', ascending=True)
# Comment for Plot 4
"""
Purpose: To assess the proportion of households completing the guaranteed 100 days of employment across states.
Insights: A key indicator of the scheme's effectiveness in providing sustained livelihood support.
Interpretation: Very low ratios across all states suggest a significant gap between the scheme's intent and its execution on the ground. States with higher ratios are more successful in this core objective. This gap could be due to demand-side issues (workers not seeking 100 days) or supply-side issues (lack of available work, administrative hurdles).
"""
# Plot 5: States by Average Payment Efficiency (%)
plot_state_bar_chart_with_values(axes[2, 0], state_performance,
'Avg_Payment_Efficiency', 'State', 'States by Average Payment Efficiency (%)',
'Average Percentage (%)', 'State', palette='OrRd', ascending=True)
# Comment for Plot 5
"""
Purpose: To visualize states' administrative efficiency in disbursing wages within the stipulated 15 days.
Insights: Highlights states with high and low adherence to timely payment.
Interpretation: High payment efficiency is crucial for worker welfare, reducing financial stress and increasing trust in the scheme. Low efficiency in some states indicates administrative bottlenecks that require urgent attention.
"""
# Plot 6: States by Average Women Persondays Ratio
plot_state_bar_chart_with_values(axes[2, 1], state_performance,
'Avg_Women_Persondays_Ratio', 'State', 'States by Average Women Persondays Ratio',
'Average Ratio', 'State', palette='viridis', ascending=True)
# Comment for Plot 6
"""
Purpose: To assess gender inclusivity and the proportion of employment generated for women across states.
Insights: Reveals states that are more successful in mobilizing and providing work for women.
Interpretation: Higher ratios signify better gender equity in MGNREGA implementation. This is a vital social equity metric, indicating successful efforts in empowering women in the rural workforce. Lower ratios might suggest cultural barriers or lack of suitable work for women in those regions.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
# --- Analysis by Time (Financial Year and Month) ---
print("\n--- Trends Over Financial Years ---")
# --- Deriving calendar_year and calendar_month on-the-fly for time series ---
# This ensures we are not relying on a globally created 'Date' column and its potential issues
# Updated month_order to reflect Indian financial year starting April
financial_month_order_for_plot = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
short_to_full_month_map = {
'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
}
month_to_num_for_fy = {month: i for i, month in enumerate(financial_month_order_for_plot)}
# Create a temporary DataFrame for time series analysis
df_temp_time = df.copy() # df is already filtered now
# Ensure 'month' column is clean before mapping
df_temp_time['month_cleaned_for_temp'] = df_temp_time['month'].astype(str).str.strip().str.title()
# Map short names to full names, and then get numerical order
df_temp_time['month_full_name'] = df_temp_time['month_cleaned_for_temp'].map(short_to_full_month_map).fillna(df_temp_time['month_cleaned_for_temp'])
df_temp_time['month_num_fy_order'] = df_temp_time['month_full_name'].map(month_to_num_for_fy)
df_temp_time['start_year_temp'] = df_temp_time['fin_year'].astype(str).str.split('-').str[0].astype(int)
# Corrected get_calendar_year_for_temp logic for Indian financial year (April-March)
def get_calendar_year_for_temp(row):
start_year = row['start_year_temp']
month_num_fy_order = row['month_num_fy_order'] # Using the order number for current year determination
if pd.isna(start_year) or pd.isna(month_num_fy_order):
return np.nan
# If month is April-Dec (0-8 in financial_month_order_for_plot index), it's the start year
# If month is Jan-Mar (9-11 in financial_month_order_for_plot index), it's start_year + 1
if month_num_fy_order >= month_to_num_for_fy['April']: # Months from April to December (index 0 to 8)
return int(start_year)
else: # Months January to March (index 9 to 11)
return int(start_year + 1)
df_temp_time['calendar_year_temp'] = df_temp_time.apply(get_calendar_year_for_temp, axis=1)
# Drop rows with NaNs in necessary temporal columns
df_temp_time.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
df_temp_time['calendar_year_temp'] = df_temp_time['calendar_year_temp'].astype(int)
# Filter for yearly trends (no further filtering needed, df is already filtered)
yearly_trends = df_temp_time.groupby('fin_year').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).sort_values(by='fin_year')
print("Yearly Trends (Operating on filtered data):")
print(yearly_trends)
# Refined Yearly Trends Plot: Using two separate line plots for clarity
fig, ax1 = plt.subplots(figsize=(15, 8))
# Plot Total Expenditure
color = 'tab:blue'
ax1.set_xlabel('Financial Year', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1 = ax1.plot(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points for Total Expenditure
for x, y in zip(yearly_trends.index, yearly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
# Plot Total Workers on a twin axis
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2 = ax2.plot(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
# Annotate points for Total Workers
for x, y in zip(yearly_trends.index, yearly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Yearly Trends: Total Expenditure and Workers Employed (2018-2024)', fontsize=16) # Updated title
# Add a combined legend from both axes
lines_combined = line1 + line2
labels_combined = [l.get_label() for l in lines_combined]
ax2.legend(lines_combined, labels_combined, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
# Comment for Yearly Trends: Total Expenditure and Workers Employed
"""
Purpose: To visualize the overall temporal evolution of financial investment and human resource utilization under MGNREGA for historically consistent years.
Insights: Reveals major spikes or dips in activity for comparable periods. Excludes 2024-2025 and 2025-2026 for accurate trend representation due to data granularity and completeness issues.
Interpretation: A generally increasing trend over years indicates program expansion or increased demand. Consistent trend analysis is now possible. Scaling to crores and millions helps in readability for large numbers. Annotated values provide precise figures for each year.
"""
# Separate plots for other yearly trends for clarity
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Yearly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024)', fontsize=16) # Updated title
# Plot Avg_Daily_Wage by Year
sns.lineplot(x=yearly_trends.index, y='Avg_Daily_Wage', data=yearly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Year', fontsize=14)
axes[0].set_xlabel('Financial Year', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# Comment for Plot (Avg_Daily_Wage)
"""
Purpose: To track the trend of average daily wages provided under MGNREGA over time for consistent years.
Insights: Shows whether worker income is increasing or stagnating.
Interpretation: A consistent upward trend is a positive sign for improving livelihood standards under the scheme. Any stagnation or decrease would warrant concern and policy review. Excludes anomalous years. Annotated values provide precise figures.
"""
# Plot Avg_100_Days_HH_Ratio by Year
sns.lineplot(x=yearly_trends.index, y='Avg_100_Days_HH_Ratio', data=yearly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Year', fontsize=14)
axes[1].set_xlabel('Financial Year', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# Comment for Plot (Avg_100_Days_HH_Ratio)
"""
Purpose: To understand the longitudinal effectiveness of providing the full 100 days of guaranteed work for consistent years.
Insights: Reveals the scheme's success in delivering its core promise. Low or declining trends highlight challenges.
Interpretation: Observed trends in this ratio are critical for policy attention. Possible reasons for fluctuations include demand for work, administrative bottlenecks, or policy shifts. Annotated values provide precise figures.
"""
# Plot Avg_Payment_Efficiency by Year
sns.lineplot(x=yearly_trends.index, y='Avg_Payment_Efficiency', data=yearly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Year', fontsize=14)
axes[2].set_xlabel('Financial Year', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(yearly_trends.index, yearly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# Comment for Plot (Avg_Payment_Efficiency)
"""
Purpose: To monitor the efficiency of wage disbursement over financial years for consistent years.
Insights: Timely payments are crucial for worker welfare. A high and stable percentage is ideal.
Interpretation: Observed efficiency trends are critical for understanding administrative performance. Any dips warrant investigation, as payment delays can severely impact the financial stability of workers. Annotated values provide precise figures.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
# Monthly trends (considering all years for seasonality)
# df is already filtered, so df_temp_time will also be filtered
df_filtered_for_monthly = df_temp_time.copy()
df_filtered_for_monthly.dropna(subset=['month_num_fy_order', 'calendar_year_temp'], inplace=True)
# Debugging the content of df_filtered_for_monthly just before monthly aggregation
print("\n--- Debugging df_filtered_for_monthly before Monthly Aggregation ---")
print(f"Shape: {df_filtered_for_monthly.shape}")
print(f"month_num_fy_order nulls: {df_filtered_for_monthly['month_num_fy_order'].isnull().sum()}")
print(f"calendar_year_temp nulls: {df_filtered_for_monthly['calendar_year_temp'].isnull().sum()}")
print(f"Financial years present: {df_filtered_for_monthly['fin_year'].unique()}")
print(f"Sample of data for monthly trends:\n{df_filtered_for_monthly[['fin_year', 'month', 'Total_Exp', 'Total_No_of_Workers', 'Average_Wage_rate_per_day_per_person']].head()}")
print(f"Sum of Total_Exp in df_filtered_for_monthly: {df_filtered_for_monthly['Total_Exp'].sum()}")
print(f"Sum of Total_No_of_Workers in df_filtered_for_monthly: {df_filtered_for_monthly['Total_No_of_Workers'].sum()}")
df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['month'].astype(str).str.strip().str.title().astype('category')
# Updated month_order to reflect Indian financial year starting April
month_order_financial_year = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
# Re-map short month names to full for consistent ordering and mapping
short_to_full_month = {
'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'
}
df_filtered_for_monthly['calendar_month'] = df_filtered_for_monthly['calendar_month'].map(short_to_full_month).fillna(df_filtered_for_monthly['calendar_month']) # Map short to full, keep full as is
df_filtered_for_monthly['calendar_month'] = pd.Categorical(df_filtered_for_monthly['calendar_month'], categories=month_order_financial_year, ordered=True) # Use new financial year month order
monthly_trends = df_filtered_for_monthly.groupby('calendar_month').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reindex(month_order_financial_year) # Order months for consistent plotting
monthly_trends = monthly_trends.fillna(0)
print("\n--- Monthly Trends (Aggregated Across All Years, for 2018-2024 FYs) ---") # Updated title
print(monthly_trends)
print("\n--- Monthly Trends Describe ---")
print(monthly_trends.describe())
fig, ax1 = plt.subplots(figsize=(15, 8))
color = 'tab:blue'
ax1.set_xlabel('Month', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1_monthly = ax1.plot(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points for Total Expenditure (Monthly)
for x, y in zip(monthly_trends.index, monthly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2_monthly = ax2.plot(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
# Annotate points for Total Workers (Monthly)
for x, y in zip(monthly_trends.index, monthly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Monthly Trends: Total Expenditure and Workers Employed (2018-2024 FYs)', fontsize=16) # Updated title
lines_combined_monthly = line1_monthly + line2_monthly
labels_combined_monthly = [l.get_label() for l in lines_combined_monthly]
ax2.legend(lines_combined_monthly, labels_combined_monthly, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
# Comment for Monthly Trends: Total Expenditure and Workers Employed
"""
Purpose: To identify seasonal patterns in MGNREGA expenditure and worker engagement, aligned with the Indian financial year, using historically consistent data (2018-2024 FYs).
Insights: Reveals peak and lean periods of activity within the financial year.
Interpretation: Activity often aligns with agricultural cycles (e.g., lower demand during peak agricultural seasons, higher during lean periods). Understanding these cycles is crucial for efficient resource planning and matching work availability with demand. This plot aggregates data across all complete financial years. Annotated values provide precise figures.
"""
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Monthly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency (2018-2024 FYs)', fontsize=16) # Updated title
# Plot Avg_Daily_Wage by Month
sns.lineplot(x=monthly_trends.index, y='Avg_Daily_Wage', data=monthly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Month', fontsize=14)
axes[0].set_xlabel('Month', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
# Comment for Plot (Avg_Daily_Wage by Month)
"""
Purpose: To analyze monthly variations in average daily wage rates, aligned with the Indian financial year, for consistent historical data.
Insights: Can show if wages fluctuate seasonally or are relatively stable throughout the year.
Interpretation: Significant monthly fluctuations might indicate changes in work types or local economic conditions. Relative stability, especially near the median, suggests consistent wage policy application. Annotated values provide precise figures.
"""
# Plot Avg_100_Days_HH_Ratio by Month
sns.lineplot(x=monthly_trends.index, y='Avg_100_Days_HH_Ratio', data=monthly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
# Comment for Plot (Avg_100_Days_HH_Ratio by Month)
"""
Purpose: To identify monthly patterns in the provision of 100 days of employment, aligned with the Indian financial year, for consistent historical data.
Insights: Reveals if certain months are better or worse for achieving this key scheme objective.
Interpretation: Seasonal peaks might align with lean agricultural seasons when demand for work is highest. Understanding these patterns can help target interventions. Annotated values provide precise figures.
"""
# Plot Avg_Payment_Efficiency by Month
sns.lineplot(x=monthly_trends.index, y='Avg_Payment_Efficiency', data=monthly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Month', fontsize=14)
axes[2].set_xlabel('Month', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
# Annotate points
for x, y in zip(monthly_trends.index, monthly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
# Comment for Plot (Avg_Payment_Efficiency by Month)
"""
Purpose: To assess monthly consistency in payment efficiency, aligned with the Indian financial year, for consistent historical data.
Insights: Can highlight months where administrative bottlenecks or external factors might cause payment delays.
Interpretation: Stable high efficiency indicates a robust system. Dips can signal areas where the payment mechanism needs strengthening, especially if it coincides with peak work demand. Annotated values provide precise figures.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
# --- New Visualization: Monthly Trends for Key Metrics by Financial Year ---
print("\n--- Monthly Trends by Financial Year for Key Metrics (2018-2024 FYs) ---") # Updated title
# Data preparation for the new plot: Group by fin_year and calendar_month
df_temp_time['calendar_month_for_plot'] = df_temp_time['month'].astype(str).str.strip().str.title().astype('category')
df_temp_time['calendar_month_for_plot'] = df_temp_time['calendar_month_for_plot'].map(short_to_full_month).fillna(df_temp_time['calendar_month_for_plot'])
df_temp_time['calendar_month_for_plot'] = pd.Categorical(df_temp_time['calendar_month_for_plot'], categories=financial_month_order_for_plot, ordered=True) # Use financial year month order
# Filter out the incomplete 2025-2026 data (already handled by df filtering)
monthly_yearly_trends_filtered = df_temp_time.groupby(['fin_year', 'calendar_month_for_plot']).agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reset_index().rename(columns={'calendar_month_for_plot': 'Month'})
# Fill NaNs only for numerical columns in this aggregated DataFrame
numerical_cols_for_fill = ['Total_Expenditure', 'Total_Workers', 'Avg_Daily_Wage', 'Avg_100_Days_HH_Ratio', 'Avg_Payment_Efficiency']
for col in numerical_cols_for_fill:
if col in monthly_yearly_trends_filtered.columns:
monthly_yearly_trends_filtered[col] = monthly_yearly_trends_filtered[col].fillna(0)
# Plotting for Total Expenditure by Month for Each Year
plt.figure(figsize=(18, 10))
lineplot_exp = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Expenditure', hue='fin_year', marker='o', palette='tab10')
plt.title('Monthly Total Expenditure (Crores ₹) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Expenditure (Crores ₹)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# Annotate points for Monthly Total Expenditure
for line in lineplot_exp.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val) and y_val != 0: # Only annotate non-NaN, non-zero values
plt.text(x_val, y_val, f'{y_val/1e7:.1f} Cr', fontsize=7, ha='center', va='bottom', color=line.get_color()) # Adjusted formatting
plt.tight_layout()
plt.show()
# Comment for Monthly Total Expenditure by Financial Year
"""
Purpose: To visualize the month-wise trends of total expenditure for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
Insights: Helps in identifying year-specific anomalies or deviations from typical seasonal patterns. Reveals if program scale increased in certain months in specific years.
Interpretation: For example, a sudden spike in expenditure during a particular month in one year might correspond to specific project rollouts or disaster relief efforts. This granular view helps in understanding yearly operational nuances.
"""
# Plotting for Total Workers by Month for Each Year
plt.figure(figsize=(18, 10))
lineplot_workers = sns.lineplot(data=monthly_yearly_trends_filtered, x='Month', y='Total_Workers', hue='fin_year', marker='x', palette='tab10')
plt.title('Monthly Total Workers (Millions) by Financial Year (2018-2024 FYs)', fontsize=16) # Updated title
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Workers (Millions)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
# Annotate points for Monthly Total Workers
for line in lineplot_workers.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val) and y_val != 0: # Only annotate non-NaN, non-zero values
plt.text(x_val, y_val, f'{y_val/1e6:.1f} M', fontsize=7, ha='center', va='bottom', color=line.get_color()) # Adjusted formatting
plt.tight_layout()
plt.show()
# Comment for Monthly Total Workers by Financial Year
"""
Purpose: To visualize the month-wise trends of total workers employed for each financial year. Excludes anomalous years. Annotated values provide precise figures at each node.
Insights: Helps in identifying seasonal employment patterns and how they might vary year-on-year.
Interpretation: Shows how demand for work or availability of projects fluctuates monthly within each financial year. This is vital for understanding employment stability and planning resource allocation.
"""
# --- Geospatial Analysis (Choropleth Maps) ---
print("\n--- Starting Geospatial Analysis ---")
# Load India's state boundaries
try:
# IMPORTANT: VERIFY YOUR GEOJSON FILE PATH AND FILENAME!
if not os.path.exists(GEOJSON_FILE):
raise FileNotFoundError(f"GeoJSON file not found at: {GEOJSON_FILE}")
india_states_gdf = geopandas.read_file(GEOJSON_FILE)
print(f"Loaded GeoJSON from: {GEOJSON_FILE}")
print(f"GeoDataFrame columns: {india_states_gdf.columns.tolist()}")
# Check if the loaded GeoJSON is state-level or district-level
is_state_level_geojson = 'NAME_1' in india_states_gdf.columns and 'NAME_2' not in india_states_gdf.columns
if not is_state_level_geojson:
print("Warning: Loaded GeoJSON appears to be district-level or different structure. Attempting to dissolve to state level.")
if 'NAME_1' in india_states_gdf.columns:
# Dissolve by NAME_1 to get state boundaries from a district-level file
india_states_gdf['State_Clean_for_Dissolve'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
india_states_gdf = india_states_gdf.dissolve(by='State_Clean_for_Dissolve')
# After dissolve, the 'NAME_1' column might become the index or a regular column depending on geopandas version
if 'State_Clean_for_Dissolve' in india_states_gdf.index.names:
india_states_gdf = india_states_gdf.reset_index()
print("Successfully dissolved district-level GeoJSON to state boundaries.")
else:
print("Error: Cannot dissolve GeoJSON to state level. 'NAME_1' column not found. Map plotting might fail or be incorrect.")
india_states_gdf['State_Clean'] = ''
# IMPORTANT: Consistent state name cleaning for both GeoDataFrame and aggregated data
# Standardize names: remove spaces, replace '&' with 'AND', remove dots.
if 'NAME_1' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['NAME_1'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
elif 'State_Clean_for_Dissolve' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['State_Clean_for_Dissolve'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
elif 'NAME_LOCAL' in india_states_gdf.columns:
india_states_gdf['State_Clean'] = india_states_gdf['NAME_LOCAL'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
else:
print("Warning: Could not find standard state name column in GeoJSON for final cleaning. Merge might fail.")
india_states_gdf['State_Clean'] = india_states_gdf.index.astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
state_performance_reset = state_performance.reset_index()
state_performance_reset['State_Clean'] = state_performance_reset['State'].astype(str).str.upper().str.replace('&', 'AND').str.replace('.', '').str.replace(' ', '').str.strip()
# Debugging state name mismatches
mgnrega_states = set(state_performance_reset['State_Clean'].unique())
geojson_states = set(india_states_gdf['State_Clean'].unique())
print(f"\nStates in MGNREGA data: {sorted(list(mgnrega_states))}")
print(f"States in GeoJSON data: {sorted(list(geojson_states))}")
print(f"States in MGNREGA but not in GeoJSON: {mgnrega_states - geojson_states}")
print(f"States in GeoJSON but not in MGNREGA: {geojson_states - mgnrega_states}")
common_states = list(mgnrega_states.intersection(geojson_states))
if not common_states:
print("Warning: No common states found between MGNREGA data and GeoJSON. Map plotting might be empty or incorrect.")
print("This is often due to naming inconsistencies. Please review the 'States in MGNREGA data' and 'States in GeoJSON data' lists above.")
merged_gdf = india_states_gdf.merge(state_performance_reset, left_on='State_Clean', right_on='State_Clean', how='left')
for col in state_performance.columns:
if col in merged_gdf.columns:
if merged_gdf[col].isnull().any():
merged_gdf[col] = merged_gdf[col].fillna(0)
# Plot Choropleth Maps for key metrics
fig, axes = plt.subplots(3, 2, figsize=(20, 25))
fig.suptitle('Geographical Distribution of Key MGNREGA Performance Indicators by State', fontsize=20)
map_metrics = [
('Total_Expenditure', 'Total Expenditure (Crores ₹)', plt.cm.viridis),
('Total_Workers', 'Total Workers (Millions)', plt.cm.magma),
('Avg_Daily_Wage', 'Average Daily Wage (₹)', plt.cm.plasma),
('Avg_100_Days_HH_Ratio', 'Average 100-Days HH Ratio', plt.cm.coolwarm),
('Avg_Payment_Efficiency', 'Average Payment Efficiency (%)', plt.cm.OrRd),
('Avg_Women_Persondays_Ratio', 'Average Women Persondays Ratio', plt.cm.viridis)
]
axes = axes.flatten()
for i, (metric, title, cmap) in enumerate(map_metrics):
ax = axes[i]
plot_data = merged_gdf[metric]
if 'Expenditure' in title:
plot_data = plot_data / 1e7
elif 'Workers' in title:
plot_data = plot_data / 1e6
vmin, vmax = None, None
if 'Ratio' in title or 'Percentage' in title:
vmin = 0
vmax = 1.0 if 'Ratio' in title else 100.0
# Ensure plot_data has non-NaN values for mapping, fill with 0 if needed for states with no data
plot_data_for_map = plot_data.fillna(0) # Temporarily fill for plotting color, missing_kwds still takes over for legend
merged_gdf.plot(column=plot_data_for_map, cmap=cmap, linewidth=0.8, ax=ax, edgecolor='0.8', legend=True,
legend_kwds={'label': title, 'orientation': "horizontal", 'shrink': 0.7},
missing_kwds={"color": "lightgrey", "label": "No Data"},
vmin=vmin, vmax=vmax
)
ax.set_title(title, fontsize=16)
ax.set_axis_off()
# Add state names as text labels (selectively for larger states for clarity)
# Use merged_gdf['State'] for original state names if available
for idx, row in merged_gdf.iterrows():
# Only label states that have data and are large enough for the label to be clear
# Filter for larger states or states with more central geometry to avoid clutter
if pd.notna(row[metric]) and row[metric] > 0 and row['State_Clean'] in common_states:
# Heuristic for selective labeling on India map to prevent clutter
if row['State_Clean'] in ['UTTARPRADESH', 'MADHYAPRADESH', 'RAJASTHAN', 'MAHARASHTRA',
'ANDHRAPRADESH', 'KARNATAKA', 'GUJARAT', 'ODISHA',
'CHHATTISGARH', 'WESTBENGAL', 'BIHAR', 'ASSAM', 'KERALA',
'PUNJAB', 'HARYANA', 'JHARKHAND', 'UTTARAKHAND']:
try:
centroid = row.geometry.centroid
# Adjust text properties based on expected background color from colormap or general contrast
# For viridis/magma/plasma, black text is generally good. For OrRd, black/dark color needed.
ax.text(centroid.x, centroid.y, row['State'], fontsize=7, ha='center', va='center', color='black', alpha=0.9,
bbox=dict(boxstyle="round,pad=0.1", fc='white', ec='none', alpha=0.5)) # Add bbox for readability
except Exception as e:
print(f"Could not add label for {row['State']}: {e}")
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
plt.show()
print("\n--- Geospatial Analysis Completed ---")
except FileNotFoundError:
print(f"Error: GeoJSON file '{GEOJSON_FILE}' not found. Skipping geospatial analysis.")
print("Please ensure you have the GeoJSON file for Indian administrative boundaries in the specified path.")
print("Example: ML_CA1/Datasets/GeoJSON/gadm41_IND_1.json (for states) or gadm41_IND_2.json (if you want to dissolve districts to states).")
except Exception as e:
print(f"An unexpected error occurred during geospatial analysis: {e}")
print("Skipping geospatial analysis. Error details:")
import traceback
traceback.print_exc()
# --- Bivariate Analysis: Correlation Matrix ---
print("\n--- Correlation Matrix of Key Numerical Metrics ---")
correlation_cols = [
'Approved_Labour_Budget', 'Total_Exp', 'Wages', 'Persondays_of_Central_Liability_so_far',
'Total_No_of_Workers', 'Total_Households_Worked', 'Total_Individuals_Worked',
'Average_Wage_rate_per_day_per_person', 'percentage_payments_gererated_within_15_days',
'Women_Persondays_Ratio', 'SC_Persondays_Ratio', 'ST_Persondays_Ratio',
'100_Days_HH_Ratio', 'percent_of_NRM_Expenditure', 'percent_of_Expenditure_on_Agriculture_Allied_Works'
]
correlation_matrix = df[correlation_cols].corr()
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Key MGNREGA Metrics', fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
# Comment for Correlation Matrix
"""
Purpose: To quantitatively assess the linear relationships between various key numerical metrics.
Insights: High correlation (close to +1 or -1) indicates that two variables tend to move together or in opposite directions.
Interpretation: For instance, a strong positive correlation between 'Total_Exp' and 'Total_Workers' would indicate that higher spending leads to more employment. This helps identify interdependencies and potential drivers for different aspects of the scheme. Weak correlations might suggest that metrics are independent, or that non-linear relationships exist.
"""
print("\n--- EDA Step 3 Completed. Dataset is ready for in-depth analysis and insights. ---")
Loaded fully cleaned data from: Datasets\Cleaned_Preprocessed\mgnrega_data_fully_cleaned.csv
DataFrame shape after robust numerical handling: (50892, 40)
--- Filtering Data for Consistent Historical Analysis (Excluding 2024-2025 and 2025-2026) ---
DataFrame shape after filtering ['2024-2025', '2025-2026']: (50892, 40)
Financial years remaining in data: ['2018-2019' '2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024']
--- Sorting DataFrame by Financial Year, Month (Financial Order), State Code, and District Code ---
DataFrame sorted successfully.
fin_year month state_code State district_code \
0 2018-2019 April 1 ANDAMAN AND NICOBAR 101
1 2018-2019 April 1 ANDAMAN AND NICOBAR 102
2 2018-2019 April 1 ANDAMAN AND NICOBAR 103
3 2018-2019 April 2 ANDHRA PRADESH 201
4 2018-2019 April 2 ANDHRA PRADESH 202
5 2018-2019 April 2 ANDHRA PRADESH 203
6 2018-2019 April 2 ANDHRA PRADESH 204
7 2018-2019 April 2 ANDHRA PRADESH 205
8 2018-2019 April 2 ANDHRA PRADESH 206
9 2018-2019 April 2 ANDHRA PRADESH 207
10 2018-2019 April 2 ANDHRA PRADESH 208
11 2018-2019 April 2 ANDHRA PRADESH 209
12 2018-2019 April 2 ANDHRA PRADESH 210
13 2018-2019 April 2 ANDHRA PRADESH 211
14 2018-2019 April 2 ANDHRA PRADESH 212
15 2018-2019 April 2 ANDHRA PRADESH 213
16 2018-2019 April 3 ARUNACHAL PRADESH 301
17 2018-2019 April 3 ARUNACHAL PRADESH 302
18 2018-2019 April 3 ARUNACHAL PRADESH 303
19 2018-2019 April 3 ARUNACHAL PRADESH 304
District
0 SOUTH ANDAMAN
1 NICOBARS
2 NORTH AND MIDDLE ANDAMAN
3 SRIKAKULAM
4 VIZIANAGARAM
5 VISAKHAPATANAM
6 EAST GODAVARI
7 WEST GODAVARI
8 KRISHNA
9 GUNTUR
10 PRAKASAM
11 NELLORE
12 CHITTOOR
13 Y.S.R
14 ANANTAPUR
15 KURNOOL
16 TAWANG
17 WEST KAMENG
18 EAST KAMENG
19 LOWER SUBANSIRI
Filtered data for analysis saved to: Datasets\Cleaned_Preprocessed\mgnrega_data_filtered_for_analysis.csv
--- Starting EDA Step 3: Refined Visualizations and Deep Dive into Trends and Patterns ---
--- Feature Engineering Completed ---
--- State-wise Aggregated Metrics ---
--- State Performance Aggregated Data (Head) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
State
ANDAMAN AND NICOBAR 2.324324e+04 4.157408e+06 460.459545
ANDHRA PRADESH 4.084342e+07 1.289226e+09 207.741754
ARUNACHAL PRADESH 1.436587e+06 3.570200e+07 441.097322
ASSAM 7.246806e+06 7.832448e+08 209.195901
BIHAR 2.381411e+07 1.836347e+09 182.112772
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency \
State
ANDAMAN AND NICOBAR 0.006548 71.759259
ANDHRA PRADESH 0.046128 99.982532
ARUNACHAL PRADESH 0.001937 74.132324
ASSAM 0.004152 99.809024
BIHAR 0.002771 99.994145
Total_Women_Persondays Avg_Women_Persondays_Ratio \
State
ANDAMAN AND NICOBAR 3.637515e+06 0.526728
ANDHRA PRADESH 7.420281e+09 0.587645
ARUNACHAL PRADESH 1.768506e+08 0.387847
ASSAM 1.350378e+09 0.450713
BIHAR 4.431626e+09 0.532949
Avg_NRM_Exp_Percent
State
ANDAMAN AND NICOBAR 44.065602
ANDHRA PRADESH 65.776250
ARUNACHAL PRADESH 39.173804
ASSAM 56.364368
BIHAR 38.235881
--- State Performance Aggregated Data (Describe) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
count 3.400000e+01 3.400000e+01 34.000000
mean 1.102063e+07 6.200833e+08 270.371549
std 1.258963e+07 7.058387e+08 149.850695
min 3.299419e+02 1.174792e+06 155.342062
25% 1.991391e+06 4.070123e+07 186.394592
50% 4.150339e+06 2.019835e+08 212.583944
75% 1.734133e+07 1.141402e+09 276.175995
max 4.084342e+07 2.197404e+09 864.677323
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency Total_Women_Persondays \
count 34.000000 34.000000 3.400000e+01
mean 0.021129 92.198627 2.132984e+09
std 0.019542 18.583462 2.855728e+09
min 0.000000 6.944444 5.353100e+04
25% 0.006618 95.998545 2.608064e+08
50% 0.013206 99.801665 7.725087e+08
75% 0.033456 99.979579 2.980862e+09
max 0.064929 100.000000 1.122258e+10
Avg_Women_Persondays_Ratio Avg_NRM_Exp_Percent
count 34.000000 34.000000
mean 0.506298 55.412519
std 0.170180 13.989864
min 0.077807 8.152778
25% 0.428733 50.184504
50% 0.499531 56.584258
75% 0.586700 61.543056
max 0.891152 85.012500
--- Trends Over Financial Years ---
Yearly Trends (Operating on filtered data):
Total_Expenditure Total_Workers Avg_Daily_Wage \
fin_year
2018-2019 4.332698e+07 3.307431e+09 215.354496
2019-2020 4.202751e+07 3.374215e+09 225.300330
2020-2021 7.354912e+07 3.713326e+09 231.115160
2021-2022 7.468682e+07 3.837848e+09 232.384528
2022-2023 6.748102e+07 3.671221e+09 274.078549
2023-2024 7.362989e+07 3.178792e+09 269.381790
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
fin_year
2018-2019 0.021104 95.018298
2019-2020 0.018640 95.453846
2020-2021 0.027561 97.118983
2021-2022 0.024389 97.668608
2022-2023 0.016337 98.318128
2023-2024 0.019112 97.457468
--- Debugging df_filtered_for_monthly before Monthly Aggregation ---
Shape: (50892, 45)
month_num_fy_order nulls: 0
calendar_year_temp nulls: 0
Financial years present: ['2018-2019' '2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024']
Sample of data for monthly trends:
fin_year month Total_Exp Total_No_of_Workers \
0 2018-2019 April 4.369320 18558.0
1 2018-2019 April 0.000000 9786.0
2 2018-2019 April 0.150230 32903.0
3 2018-2019 April 6662.301165 1320134.0
4 2018-2019 April 6176.642097 1222586.0
Average_Wage_rate_per_day_per_person
0 37.013536
1 252.492009
2 252.492009
3 140.062250
4 123.668847
Sum of Total_Exp in df_filtered_for_monthly: 374701344.0047136
Sum of Total_No_of_Workers in df_filtered_for_monthly: 21082833425.0
--- Monthly Trends (Aggregated Across All Years, for 2018-2024 FYs) ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
calendar_month
April 3.501258e+06 1.759502e+09 306.991754
May 9.071760e+06 1.763155e+09 252.167022
June 1.613888e+07 1.765832e+09 269.522965
July 2.221178e+07 1.767538e+09 258.861407
August 2.722379e+07 1.767046e+09 250.516487
September 3.121897e+07 1.767238e+09 239.753141
October 3.486475e+07 1.766182e+09 232.320477
November 3.831889e+07 1.764144e+09 225.823328
December 4.218389e+07 1.755753e+09 221.710547
January 4.567729e+07 1.744826e+09 217.258905
February 4.909899e+07 1.736200e+09 210.659327
March 5.519109e+07 1.725416e+09 218.892321
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
calendar_month
April 0.000166 91.398682
May 0.001401 93.784956
June 0.004216 95.247326
July 0.006523 96.216331
August 0.008978 97.091014
September 0.012111 97.609672
October 0.015836 98.031049
November 0.020845 98.314447
December 0.028194 98.429422
January 0.036863 98.617868
February 0.049889 98.781608
March 0.069138 99.086138
--- Monthly Trends Describe ---
Total_Expenditure Total_Workers Avg_Daily_Wage \
count 1.200000e+01 1.200000e+01 12.000000
mean 3.122511e+07 1.756903e+09 242.039807
std 1.615437e+07 1.399384e+07 27.538471
min 3.501258e+06 1.725416e+09 210.659327
25% 2.069355e+07 1.753021e+09 221.005990
50% 3.304186e+07 1.763649e+09 236.036809
75% 4.305724e+07 1.766398e+09 253.840618
max 5.519109e+07 1.767538e+09 306.991754
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
count 12.000000 12.000000
mean 0.021180 96.884043
std 0.021351 2.347021
min 0.000166 91.398682
25% 0.005946 95.974080
50% 0.013973 97.820361
75% 0.030361 98.476534
max 0.069138 99.086138
--- Monthly Trends by Financial Year for Key Metrics (2018-2024 FYs) ---
--- Starting Geospatial Analysis ---
Loaded GeoJSON from: Datasets\GEOJSON\gadm41_IND_1.json
GeoDataFrame columns: ['GID_1', 'GID_0', 'COUNTRY', 'NAME_1', 'VARNAME_1', 'NL_NAME_1', 'TYPE_1', 'ENGTYPE_1', 'CC_1', 'HASC_1', 'ISO_1', 'geometry']
States in MGNREGA data: ['ANDAMANANDNICOBAR', 'ANDHRAPRADESH', 'ARUNACHALPRADESH', 'ASSAM', 'BIHAR', 'CHHATTISGARH', 'DNHAVELIANDDD', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHALPRADESH', 'JAMMUANDKASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LADAKH', 'LAKSHADWEEP', 'MADHYAPRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMILNADU', 'TELANGANA', 'TRIPURA', 'UTTARAKHAND', 'UTTARPRADESH', 'WESTBENGAL']
States in GeoJSON data: ['ANDAMANANDNICOBAR', 'ANDHRAPRADESH', 'ARUNACHALPRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'DADRAANDNAGARHAVELI', 'DAMANANDDIU', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHALPRADESH', 'JAMMUANDKASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'LAKSHADWEEP', 'MADHYAPRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM', 'NAGALAND', 'NCTOFDELHI', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'SIKKIM', 'TAMILNADU', 'TELANGANA', 'TRIPURA', 'UTTARAKHAND', 'UTTARPRADESH', 'WESTBENGAL']
States in MGNREGA but not in GeoJSON: {'LADAKH', 'DNHAVELIANDDD'}
States in GeoJSON but not in MGNREGA: {'DADRAANDNAGARHAVELI', 'DAMANANDDIU', 'NCTOFDELHI', 'CHANDIGARH'}
--- Geospatial Analysis Completed --- --- Correlation Matrix of Key Numerical Metrics ---
--- EDA Step 3 Completed. Dataset is ready for in-depth analysis and insights. ---
In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
# geopandas is not strictly necessary for single-district temporal trends,
# but keeping for completeness if future geospatial desires emerge.
# import geopandas
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# --- Configuration for File Paths ---
# Assuming the project root is where your main script runs from (e.g., 'ML_CA1')
PROJECT_ROOT = ""
CLEANED_DATA_DIR = os.path.join(PROJECT_ROOT, "Datasets", "Cleaned_Preprocessed")
# This is the CSV file containing the overall data, filtered for historical consistency
INPUT_DATA_FILE = os.path.join(CLEANED_DATA_DIR, "mgnrega_data_filtered_for_analysis.csv")
# --- Step 1: Load Data and Filter for Nagpur District ---
print("--- Starting Focused Analysis of Nagpur District ---")
print("\n--- Step 1: Loading Data and Filtering for Nagpur District ---")
try:
df_raw = pd.read_csv(INPUT_DATA_FILE)
print(f"Loaded filtered data from: {INPUT_DATA_FILE}")
print(f"Initial DataFrame shape: {df_raw.shape}")
except FileNotFoundError:
print(f"Error: Input data file '{INPUT_DATA_FILE}' not found. Please ensure the path is correct.")
exit()
except Exception as e:
print(f"An unexpected error occurred during data loading: {e}")
exit()
# Filter for Nagpur district only (ensure exact spelling matches your data)
NAGPUR_DISTRICT_NAME = 'NAGPUR' # Assuming 'NAGPUR' is the exact district name in your 'District' column
df_nagpur = df_raw[df_raw['District'] == NAGPUR_DISTRICT_NAME].copy()
if df_nagpur.empty:
print(f"Error: No data found for district '{NAGPUR_DISTRICT_NAME}'. Please check the district name or your input data file.")
print(f"Available districts: {df_raw['District'].unique()}")
exit()
print(f"Filtered data for Nagpur district. Shape: {df_nagpur.shape}")
print(f"Financial years included for Nagpur: {df_nagpur['fin_year'].unique()}")
# --- Step 2: Feature Engineering and Temporal Key Derivation (for Nagpur Subset) ---
print("\n--- Step 2: Feature Engineering and Temporal Key Derivation for Nagpur ---")
# Define month order for Indian Financial Year (April to March)
FINANCIAL_MONTH_ORDER = ['April', 'May', 'June', 'July', 'August', 'September',
'October', 'November', 'December', 'January', 'February', 'March']
# Robust mapping for month names (handles short forms too)
MONTH_TO_NUM_FOR_FY = {
'April': 0, 'May': 1, 'June': 2, 'July': 3, 'August': 4, 'September': 5,
'October': 6, 'November': 7, 'December': 8, 'January': 9, 'February': 10, 'March': 11,
'Apr': 0, 'May': 1, 'Jun': 2, 'Jul': 3, 'Aug': 4, 'Sep': 5,
'Oct': 6, 'Nov': 7, 'Dec': 8, 'Jan': 9, 'Feb': 10, 'Mar': 11
}
# 2.1: Temporal Keys for Sorting and Aggregation
df_nagpur['month_processed'] = df_nagpur['month'].astype(str).str.strip().str.title()
df_nagpur['month_full_name'] = df_nagpur['month_processed'].apply(lambda x: next((m_full for m_full in FINANCIAL_MONTH_ORDER if m_full.startswith(x)), x))
df_nagpur['month_num_fy_order'] = df_nagpur['month_full_name'].map(MONTH_TO_NUM_FOR_FY)
df_nagpur['start_year_temp'] = df_nagpur['fin_year'].astype(str).str.split('-').str[0].astype(int)
# Drop rows with NaNs in temporal keys (should be minimal)
df_nagpur.dropna(subset=['month_num_fy_order', 'start_year_temp'], inplace=True)
# 2.2: Engineered Ratio Features
df_nagpur['Women_Persondays_Ratio'] = df_nagpur['Women_Persondays'] / df_nagpur['Persondays_of_Central_Liability_so_far']
df_nagpur['Women_Persondays_Ratio'] = df_nagpur['Women_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df_nagpur['Women_Persondays_Ratio'] = np.clip(df_nagpur['Women_Persondays_Ratio'], 0, 1)
df_nagpur['SC_Persondays_Ratio'] = df_nagpur['SC_persondays'] / df_nagpur['Persondays_of_Central_Liability_so_far']
df_nagpur['SC_Persondays_Ratio'] = df_nagpur['SC_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df_nagpur['SC_Persondays_Ratio'] = np.clip(df_nagpur['SC_Persondays_Ratio'], 0, 1)
df_nagpur['ST_Persondays_Ratio'] = df_nagpur['ST_persondays'] / df_nagpur['Persondays_of_Central_Liability_so_far']
df_nagpur['ST_Persondays_Ratio'] = df_nagpur['ST_Persondays_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df_nagpur['ST_Persondays_Ratio'] = np.clip(df_nagpur['ST_Persondays_Ratio'], 0, 1)
df_nagpur['100_Days_HH_Ratio'] = df_nagpur['Total_No_of_HHs_completed_100_Days_of_Wage_Employment'] / df_nagpur['Total_Households_Worked']
df_nagpur['100_Days_HH_Ratio'] = df_nagpur['100_Days_HH_Ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)
df_nagpur['100_Days_HH_Ratio'] = np.clip(df_nagpur['100_Days_HH_Ratio'], 0, 1)
print("Feature engineering and temporal keys successfully derived for Nagpur data.")
# Convert relevant columns to categorical for efficiency
df_nagpur['fin_year'] = df_nagpur['fin_year'].astype('category')
df_nagpur['month_full_name'] = pd.Categorical(df_nagpur['month_full_name'], categories=FINANCIAL_MONTH_ORDER, ordered=True)
# Drop redundant temp columns
df_nagpur.drop(columns=['month_processed'], inplace=True, errors='ignore')
# --- Step 3: Initial Data Overview for Nagpur ---
print("\n--- Step 3: Initial Data Overview for Nagpur ---")
print(df_nagpur.info())
print("\n--- Sample Data for Nagpur (Head) ---")
print(df_nagpur.head())
print("\n--- Descriptive Statistics for Numerical Features in Nagpur ---")
numerical_cols_nagpur = df_nagpur.select_dtypes(include=np.number).columns.tolist()
desc_stats_nagpur = df_nagpur[numerical_cols_nagpur].describe().T
print(desc_stats_nagpur)
print("\n--- Count of Zero Values in Key Numerical Columns for Nagpur ---")
for col in ['Total_Exp', 'Total_No_of_Workers', 'Approved_Labour_Budget', 'Average_Wage_rate_per_day_per_person', '100_Days_HH_Ratio']:
if col in df_nagpur.columns:
num_zeros = (df_nagpur[col] == 0).sum()
if num_zeros > 0:
print(f"'{col}': {num_zeros} zeros ({num_zeros/len(df_nagpur)*100:.2f}%)")
# --- Step 4: Temporal Trend Analysis for Nagpur District ---
print("\n--- Step 4: Analyzing Temporal Trends for Nagpur District ---")
# Aggregation for Yearly Trends for Nagpur
nagpur_yearly_trends = df_nagpur.groupby('fin_year').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).sort_values(by='fin_year')
print("\nNagpur Yearly Trends (Overall):")
print(nagpur_yearly_trends)
# Plotting Overall Nagpur Yearly Trends
fig, ax1 = plt.subplots(figsize=(15, 8))
# Plot Total Expenditure
color = 'tab:blue'
ax1.set_xlabel('Financial Year', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1 = ax1.plot(nagpur_yearly_trends.index, nagpur_yearly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_yearly_trends.index, nagpur_yearly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
# Plot Total Workers on a twin axis
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2 = ax2.plot(nagpur_yearly_trends.index, nagpur_yearly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
for x, y in zip(nagpur_yearly_trends.index, nagpur_yearly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Nagpur Yearly Trends: Total Expenditure and Workers Employed', fontsize=16)
lines_combined = line1 + line2
labels_combined = [l.get_label() for l in lines_combined]
ax2.legend(lines_combined, labels_combined, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
"""
Purpose: To visualize overall trends of expenditure and employment under MGNREGA specifically for Nagpur district.
Insights: Provides insight into Nagpur's long-term commitment and activity levels in the scheme.
Interpretation: An increasing trend indicates program expansion, while a decline might signal reduced demand or implementation challenges specific to Nagpur.
"""
# Separate plots for other yearly trends for Nagpur
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Nagpur Yearly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency', fontsize=16)
# Plot Avg_Daily_Wage by Year
sns.lineplot(x=nagpur_yearly_trends.index, y='Avg_Daily_Wage', data=nagpur_yearly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Year', fontsize=14)
axes[0].set_xlabel('Financial Year', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_yearly_trends.index, nagpur_yearly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
"""
Purpose: To track the trend of average daily wages provided under MGNREGA in Nagpur over time.
Insights: Shows whether worker income is increasing or stagnating in Nagpur.
Interpretation: A consistent upward trend is a positive sign for improving livelihood standards. Any stagnation or decrease would warrant concern and policy review specific to Nagpur.
"""
# Plot Avg_100_Days_HH_Ratio by Year
sns.lineplot(x=nagpur_yearly_trends.index, y='Avg_100_Days_HH_Ratio', data=nagpur_yearly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Year', fontsize=14)
axes[1].set_xlabel('Financial Year', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_yearly_trends.index, nagpur_yearly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
"""
Purpose: To understand the longitudinal effectiveness of providing the full 100 days of guaranteed work in Nagpur.
Insights: Reveals the scheme's success in delivering its core promise locally. Low or declining trends highlight challenges.
Interpretation: Observed trends in this ratio are critical for policy attention in Nagpur, suggesting whether households are receiving the full guarantee.
"""
# Plot Avg_Payment_Efficiency by Year
sns.lineplot(x=nagpur_yearly_trends.index, y='Avg_Payment_Efficiency', data=nagpur_yearly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Year', fontsize=14)
axes[2].set_xlabel('Financial Year', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_yearly_trends.index, nagpur_yearly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
"""
Purpose: To monitor the efficiency of wage disbursement in Nagpur over financial years.
Insights: Timely payments are crucial for worker welfare. A high and stable percentage is ideal.
Interpretation: Observed efficiency trends are critical for understanding administrative performance in Nagpur. Any dips warrant investigation, as payment delays can severely impact the financial stability of workers.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
# Monthly trends (considering all years for seasonality for Nagpur)
print("\n--- Monthly Trends for Nagpur (Aggregated Across All Years) ---")
nagpur_monthly_trends = df_nagpur.groupby('month_full_name').agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reindex(FINANCIAL_MONTH_ORDER) # Order months for consistent plotting
nagpur_monthly_trends = nagpur_monthly_trends.fillna(0) # Fill NaNs for plotting
print("\nNagpur Monthly Trends (Aggregated Across All Years):")
print(nagpur_monthly_trends)
fig, ax1 = plt.subplots(figsize=(15, 8))
color = 'tab:blue'
ax1.set_xlabel('Month', fontsize=12)
ax1.set_ylabel('Total Expenditure (Crores ₹)', color=color, fontsize=12)
line1_monthly = ax1.plot(nagpur_monthly_trends.index, nagpur_monthly_trends['Total_Expenditure'] / 1e7, color=color, marker='o', label='Total Expenditure')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_monthly_trends.index, nagpur_monthly_trends['Total_Expenditure'] / 1e7):
ax1.annotate(f'{y:.1f} Cr', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color=color)
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Total Workers (Millions)', color=color, fontsize=12)
line2_monthly = ax2.plot(nagpur_monthly_trends.index, nagpur_monthly_trends['Total_Workers'] / 1e6, color=color, marker='x', label='Total Workers')
ax2.tick_params(axis='y', labelcolor=color)
for x, y in zip(nagpur_monthly_trends.index, nagpur_monthly_trends['Total_Workers'] / 1e6):
ax2.annotate(f'{y:.1f} M', (x, y), textcoords="offset points", xytext=(0,-15), ha='center', fontsize=8, color=color)
fig.suptitle('Nagpur Monthly Trends: Total Expenditure and Workers Employed', fontsize=16)
lines_combined_monthly = line1_monthly + line2_monthly
labels_combined_monthly = [l.get_label() for l in lines_combined_monthly]
ax2.legend(lines_combined_monthly, labels_combined_monthly, loc='upper left', bbox_to_anchor=(0.05, 0.95))
fig.tight_layout()
plt.show()
"""
Purpose: To identify seasonal patterns in MGNREGA expenditure and worker engagement for Nagpur district, aligned with the Indian financial year.
Insights: Reveals peak and lean periods of activity within the financial year specific to Nagpur.
Interpretation: Activity often aligns with agricultural cycles or local demand. Understanding these cycles is crucial for efficient resource planning. Annotated values provide precise figures.
"""
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Nagpur Monthly Trends: Average Wages, 100-Days HH Ratio, and Payment Efficiency', fontsize=16)
# Plot Avg_Daily_Wage by Month
sns.lineplot(x=nagpur_monthly_trends.index, y='Avg_Daily_Wage', data=nagpur_monthly_trends, marker='o', ax=axes[0], color='green')
axes[0].set_title('Average Daily Wage Rate by Month', fontsize=14)
axes[0].set_xlabel('Month', fontsize=12)
axes[0].set_ylabel('Average Daily Wage (₹)', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_monthly_trends.index, nagpur_monthly_trends['Avg_Daily_Wage']):
axes[0].annotate(f'{y:.0f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='green')
"""
Purpose: To analyze monthly variations in average daily wage rates for Nagpur district.
Insights: Can show if wages fluctuate seasonally or are relatively stable locally.
Interpretation: Significant monthly fluctuations might indicate changes in local work types or economic conditions. Relative stability suggests consistent wage policy application. Annotated values provide precise figures.
"""
# Plot Avg_100_Days_HH_Ratio by Month
sns.lineplot(x=nagpur_monthly_trends.index, y='Avg_100_Days_HH_Ratio', data=nagpur_monthly_trends, marker='o', ax=axes[1], color='purple')
axes[1].set_title('Average 100-Days HH Ratio by Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Average Ratio', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_monthly_trends.index, nagpur_monthly_trends['Avg_100_Days_HH_Ratio']):
axes[1].annotate(f'{y:.3f}', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='purple')
"""
Purpose: To identify monthly patterns in the provision of 100 days of employment for Nagpur district.
Insights: Reveals if certain months are better or worse for achieving this key scheme objective locally.
Interpretation: Seasonal peaks might align with lean agricultural seasons when demand for work is highest. Understanding these patterns helps local planning. Annotated values provide precise figures.
"""
# Plot Avg_Payment_Efficiency by Month
sns.lineplot(x=nagpur_monthly_trends.index, y='Avg_Payment_Efficiency', data=nagpur_monthly_trends, marker='o', ax=axes[2], color='brown')
axes[2].set_title('Average Payment Efficiency by Month', fontsize=14)
axes[2].set_xlabel('Month', fontsize=12)
axes[2].set_ylabel('Average Percentage (%)', fontsize=12)
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', linestyle='--', alpha=0.7)
for x, y in zip(nagpur_monthly_trends.index, nagpur_monthly_trends['Avg_Payment_Efficiency']):
axes[2].annotate(f'{y:.1f}%', (x, y), textcoords="offset points", xytext=(0,10), ha='center', fontsize=8, color='brown')
"""
Purpose: To assess monthly consistency in payment efficiency for Nagpur district.
Insights: Can highlight months where administrative bottlenecks or external factors might cause payment delays locally.
Interpretation: Stable high efficiency indicates a robust local system. Dips can signal areas where the payment mechanism needs strengthening. Annotated values provide precise figures.
"""
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()
# --- Step 5: Monthly Trends for Key Metrics by Financial Year (for Nagpur) ---
print("\n--- Step 5: Monthly Trends by Financial Year for Nagpur District ---")
# Aggregate monthly data by financial year and month for Nagpur
nagpur_monthly_yearly_trends = df_nagpur.groupby(['fin_year', 'month_full_name']).agg(
Total_Expenditure=('Total_Exp', 'sum'),
Total_Workers=('Total_No_of_Workers', 'sum'),
Avg_Daily_Wage=('Average_Wage_rate_per_day_per_person', 'mean'),
Avg_100_Days_HH_Ratio=('100_Days_HH_Ratio', 'mean'),
Avg_Payment_Efficiency=('percentage_payments_gererated_within_15_days', 'mean')
).reset_index().rename(columns={'month_full_name': 'Month'})
# Fill NaNs only for numerical columns in this aggregated DataFrame
numerical_cols_for_fill = ['Total_Expenditure', 'Total_Workers', 'Avg_Daily_Wage', 'Avg_100_Days_HH_Ratio', 'Avg_Payment_Efficiency']
for col in numerical_cols_for_fill:
if col in nagpur_monthly_yearly_trends.columns:
nagpur_monthly_yearly_trends[col] = nagpur_monthly_yearly_trends[col].fillna(0)
# Plotting Total Expenditure by Month for Each Year in Nagpur
plt.figure(figsize=(18, 10))
lineplot_exp_yearly = sns.lineplot(data=nagpur_monthly_yearly_trends, x='Month', y='Total_Expenditure', hue='fin_year', marker='o', palette='tab10')
plt.title('Nagpur Monthly Total Expenditure (Crores ₹) by Financial Year', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Expenditure (Crores ₹)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
for line in lineplot_exp_yearly.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val) and y_val != 0:
plt.text(x_val, y_val, f'{y_val/1e7:.1f} Cr', fontsize=7, ha='center', va='bottom', color=line.get_color())
plt.tight_layout()
plt.show()
"""
Purpose: To visualize the month-wise trends of total expenditure for each financial year in Nagpur.
Insights: Helps in identifying year-specific anomalies or deviations from typical seasonal patterns locally.
Interpretation: Reveals if program scale increased in certain months in specific years for Nagpur. This granular view helps in understanding yearly operational nuances.
"""
# Plotting Total Workers by Month for Each Year in Nagpur
plt.figure(figsize=(18, 10))
lineplot_workers_yearly = sns.lineplot(data=nagpur_monthly_yearly_trends, x='Month', y='Total_Workers', hue='fin_year', marker='x', palette='tab10')
plt.title('Nagpur Monthly Total Workers (Millions) by Financial Year', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Workers (Millions)', fontsize=12)
plt.tick_params(axis='x', rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Financial Year', bbox_to_anchor=(1.05, 1), loc='upper left')
for line in lineplot_workers_yearly.get_lines():
for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
if pd.notna(y_val) and y_val != 0:
plt.text(x_val, y_val, f'{y_val/1e6:.1f} M', fontsize=7, ha='center', va='bottom', color=line.get_color())
plt.tight_layout()
plt.show()
"""
Purpose: To visualize the month-wise trends of total workers employed for each financial year in Nagpur.
Insights: Helps in identifying seasonal employment patterns and how they might vary year-on-year locally.
Interpretation: Shows how demand for work or availability of projects fluctuates monthly within each financial year. This is vital for understanding employment stability and planning resource allocation for Nagpur.
"""
# --- Step 6: Final Thoughts and Next Steps for Nagpur Analysis ---
print("\n--- Nagpur District MGNREGA Analysis: Focused Trends and Patterns Completed. ---")
print("This focused analysis provides detailed insights into MGNREGA implementation within Nagpur district.")
print("The temporal trends and insights derived here can inform local policy decisions.")
--- Starting Focused Analysis of Nagpur District ---
--- Step 1: Loading Data and Filtering for Nagpur District ---
Loaded filtered data from: Datasets\Cleaned_Preprocessed\mgnrega_data_filtered_for_analysis.csv
Initial DataFrame shape: (50892, 40)
Filtered data for Nagpur district. Shape: (72, 40)
Financial years included for Nagpur: ['2018-2019' '2019-2020' '2020-2021' '2021-2022' '2022-2023' '2023-2024']
--- Step 2: Feature Engineering and Temporal Key Derivation for Nagpur ---
Feature engineering and temporal keys successfully derived for Nagpur data.
--- Step 3: Initial Data Overview for Nagpur ---
<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 318 to 50491
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fin_year 72 non-null category
1 month 72 non-null object
2 state_code 72 non-null int64
3 State 72 non-null object
4 district_code 72 non-null int64
5 District 72 non-null object
6 Approved_Labour_Budget 72 non-null float64
7 Average_Wage_rate_per_day_per_person 72 non-null float64
8 Average_days_of_employment_provided_per_Household 72 non-null float64
9 Differently_abled_persons_worked 72 non-null float64
10 Material_and_skilled_Wages 72 non-null float64
11 Number_of_Completed_Works 72 non-null float64
12 Number_of_GPs_with_NIL_exp 72 non-null float64
13 Number_of_Ongoing_Works 72 non-null float64
14 Persondays_of_Central_Liability_so_far 72 non-null float64
15 SC_persondays 72 non-null float64
16 SC_workers_against_active_workers 72 non-null float64
17 ST_persondays 72 non-null float64
18 ST_workers_against_active_workers 72 non-null float64
19 Total_Adm_Expenditure 72 non-null float64
20 Total_Exp 72 non-null float64
21 Total_Households_Worked 72 non-null float64
22 Total_Individuals_Worked 72 non-null float64
23 Total_No_of_Active_Job_Cards 72 non-null float64
24 Total_No_of_Active_Workers 72 non-null float64
25 Total_No_of_HHs_completed_100_Days_of_Wage_Employment 72 non-null float64
26 Total_No_of_JobCards_issued 72 non-null float64
27 Total_No_of_Workers 72 non-null float64
28 Total_No_of_Works_Takenup 72 non-null float64
29 Wages 72 non-null float64
30 Women_Persondays 72 non-null float64
31 percent_of_Category_B_Works 72 non-null float64
32 percent_of_Expenditure_on_Agriculture_Allied_Works 72 non-null float64
33 percent_of_NRM_Expenditure 72 non-null float64
34 percentage_payments_gererated_within_15_days 72 non-null float64
35 Date 0 non-null float64
36 Women_Persondays_Ratio 72 non-null float64
37 SC_Persondays_Ratio 72 non-null float64
38 ST_Persondays_Ratio 72 non-null float64
39 100_Days_HH_Ratio 72 non-null float64
40 month_full_name 72 non-null category
41 month_num_fy_order 72 non-null int64
42 start_year_temp 72 non-null int64
dtypes: category(2), float64(34), int64(4), object(3)
memory usage: 24.4+ KB
None
--- Sample Data for Nagpur (Head) ---
fin_year month state_code State district_code District \
318 2018-2019 April 18 MAHARASHTRA 1827 NAGPUR
986 2018-2019 May 18 MAHARASHTRA 1827 NAGPUR
1654 2018-2019 June 18 MAHARASHTRA 1827 NAGPUR
2322 2018-2019 July 18 MAHARASHTRA 1827 NAGPUR
2990 2018-2019 Aug 18 MAHARASHTRA 1827 NAGPUR
Approved_Labour_Budget Average_Wage_rate_per_day_per_person \
318 376349.0 137.552109
986 776612.0 152.271672
1654 1084941.0 179.536980
2322 1311749.0 181.600118
2990 1487024.0 187.787268
Average_days_of_employment_provided_per_Household \
318 16.0
986 24.0
1654 29.0
2322 32.0
2990 35.0
Differently_abled_persons_worked ... percent_of_NRM_Expenditure \
318 67.0 ... 72.24
986 124.0 ... 69.05
1654 148.0 ... 70.52
2322 185.0 ... 70.99
2990 213.0 ... 71.29
percentage_payments_gererated_within_15_days Date \
318 99.99 NaN
986 100.00 NaN
1654 100.00 NaN
2322 100.00 NaN
2990 100.00 NaN
Women_Persondays_Ratio SC_Persondays_Ratio ST_Persondays_Ratio \
318 0.392035 0.156447 0.132818
986 0.389974 0.156660 0.125524
1654 0.382166 0.153166 0.127720
2322 0.368577 0.153426 0.131250
2990 0.358699 0.156639 0.132693
100_Days_HH_Ratio month_full_name month_num_fy_order start_year_temp
318 0.000603 April 0 2018
986 0.008550 May 1 2018
1654 0.016948 June 2 2018
2322 0.038761 July 3 2018
2990 0.055199 August 4 2018
[5 rows x 43 columns]
--- Descriptive Statistics for Numerical Features in Nagpur ---
count mean \
state_code 72.0 1.800000e+01
district_code 72.0 1.827000e+03
Approved_Labour_Budget 72.0 1.373898e+06
Average_Wage_rate_per_day_per_person 72.0 2.124429e+02
Average_days_of_employment_provided_per_Household 72.0 3.295833e+01
Differently_abled_persons_worked 72.0 2.764861e+02
Material_and_skilled_Wages 72.0 7.214092e+02
Number_of_Completed_Works 72.0 4.445111e+03
Number_of_GPs_with_NIL_exp 72.0 5.365278e+01
Number_of_Ongoing_Works 72.0 1.396217e+04
Persondays_of_Central_Liability_so_far 72.0 1.222986e+06
SC_persondays 72.0 1.904906e+05
SC_workers_against_active_workers 72.0 3.391543e+04
ST_persondays 72.0 1.507574e+05
ST_workers_against_active_workers 72.0 2.846496e+04
Total_Adm_Expenditure 72.0 1.468370e+02
Total_Exp 72.0 3.595353e+03
Total_Households_Worked 72.0 3.408544e+04
Total_Individuals_Worked 72.0 4.997926e+04
Total_No_of_Active_Job_Cards 72.0 9.019483e+04
Total_No_of_Active_Workers 72.0 2.153674e+05
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 72.0 2.277694e+03
Total_No_of_JobCards_issued 72.0 2.227571e+05
Total_No_of_Workers 72.0 4.583063e+05
Total_No_of_Works_Takenup 72.0 1.840728e+04
Wages 72.0 2.727106e+03
Women_Persondays 72.0 4.664067e+05
percent_of_Category_B_Works 72.0 7.859722e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 72.0 4.876347e+01
percent_of_NRM_Expenditure 72.0 5.883681e+01
percentage_payments_gererated_within_15_days 72.0 9.999931e+01
Date 0.0 NaN
Women_Persondays_Ratio 72.0 3.832742e-01
SC_Persondays_Ratio 72.0 1.561983e-01
ST_Persondays_Ratio 72.0 1.244744e-01
100_Days_HH_Ratio 72.0 5.645201e-02
month_num_fy_order 72.0 5.500000e+00
start_year_temp 72.0 2.020500e+03
std \
state_code 0.000000
district_code 0.000000
Approved_Labour_Budget 603711.723964
Average_Wage_rate_per_day_per_person 33.930351
Average_days_of_employment_provided_per_Household 9.606977
Differently_abled_persons_worked 117.808748
Material_and_skilled_Wages 830.034158
Number_of_Completed_Works 2667.935497
Number_of_GPs_with_NIL_exp 65.200268
Number_of_Ongoing_Works 3759.731733
Persondays_of_Central_Liability_so_far 652872.083905
SC_persondays 102643.071462
SC_workers_against_active_workers 2133.696170
ST_persondays 78853.071908
ST_workers_against_active_workers 1445.390014
Total_Adm_Expenditure 111.045366
Total_Exp 2264.319404
Total_Households_Worked 13299.175412
Total_Individuals_Worked 20760.463694
Total_No_of_Active_Job_Cards 10605.214107
Total_No_of_Active_Workers 19535.791190
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 1854.711242
Total_No_of_JobCards_issued 34185.186851
Total_No_of_Workers 49805.979211
Total_No_of_Works_Takenup 4209.766844
Wages 1615.294918
Women_Persondays 257073.033443
percent_of_Category_B_Works 7.937537
percent_of_Expenditure_on_Agriculture_Allied_Works 14.845400
percent_of_NRM_Expenditure 27.010518
percentage_payments_gererated_within_15_days 0.002560
Date NaN
Women_Persondays_Ratio 0.040318
SC_Persondays_Ratio 0.018489
ST_Persondays_Ratio 0.008116
100_Days_HH_Ratio 0.039819
month_num_fy_order 3.476278
start_year_temp 1.719810
min \
state_code 18.000000
district_code 1827.000000
Approved_Labour_Budget 90077.000000
Average_Wage_rate_per_day_per_person 131.142447
Average_days_of_employment_provided_per_Household 13.000000
Differently_abled_persons_worked 50.000000
Material_and_skilled_Wages 0.000000
Number_of_Completed_Works 179.000000
Number_of_GPs_with_NIL_exp 14.000000
Number_of_Ongoing_Works 7650.000000
Persondays_of_Central_Liability_so_far 90077.000000
SC_persondays 11231.000000
SC_workers_against_active_workers 29546.000000
ST_persondays 13039.000000
ST_workers_against_active_workers 26146.000000
Total_Adm_Expenditure 0.000000
Total_Exp 187.120845
Total_Households_Worked 6630.000000
Total_Individuals_Worked 9063.000000
Total_No_of_Active_Job_Cards 73037.000000
Total_No_of_Active_Workers 183180.000000
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 0.000000
Total_No_of_JobCards_issued 179066.000000
Total_No_of_Workers 393213.000000
Total_No_of_Works_Takenup 9749.000000
Wages 142.192040
Women_Persondays 39053.000000
percent_of_Category_B_Works 62.000000
percent_of_Expenditure_on_Agriculture_Allied_Works 24.130000
percent_of_NRM_Expenditure 3.360000
percentage_payments_gererated_within_15_days 99.990000
Date NaN
Women_Persondays_Ratio 0.317074
SC_Persondays_Ratio 0.124682
ST_Persondays_Ratio 0.107061
100_Days_HH_Ratio 0.000000
month_num_fy_order 0.000000
start_year_temp 2018.000000
25% \
state_code 18.000000
district_code 1827.000000
Approved_Labour_Budget 897675.750000
Average_Wage_rate_per_day_per_person 190.709330
Average_days_of_employment_provided_per_Household 26.750000
Differently_abled_persons_worked 192.000000
Material_and_skilled_Wages 164.851711
Number_of_Completed_Works 2427.250000
Number_of_GPs_with_NIL_exp 22.750000
Number_of_Ongoing_Works 10391.750000
Persondays_of_Central_Liability_so_far 712994.250000
SC_persondays 104053.500000
SC_workers_against_active_workers 33009.000000
ST_persondays 85989.500000
ST_workers_against_active_workers 27282.750000
Total_Adm_Expenditure 48.348037
Total_Exp 1804.303002
Total_Households_Worked 24984.250000
Total_Individuals_Worked 35952.250000
Total_No_of_Active_Job_Cards 80011.000000
Total_No_of_Active_Workers 196453.500000
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 530.000000
Total_No_of_JobCards_issued 187843.750000
Total_No_of_Workers 407078.250000
Total_No_of_Works_Takenup 15072.250000
Wages 1505.586636
Women_Persondays 282847.250000
percent_of_Category_B_Works 74.000000
percent_of_Expenditure_on_Agriculture_Allied_Works 37.260000
percent_of_NRM_Expenditure 49.325000
percentage_payments_gererated_within_15_days 100.000000
Date NaN
Women_Persondays_Ratio 0.347655
SC_Persondays_Ratio 0.141932
ST_Persondays_Ratio 0.122248
100_Days_HH_Ratio 0.017813
month_num_fy_order 2.750000
start_year_temp 2019.000000
50% \
state_code 1.800000e+01
district_code 1.827000e+03
Approved_Labour_Budget 1.393116e+06
Average_Wage_rate_per_day_per_person 2.136952e+02
Average_days_of_employment_provided_per_Household 3.400000e+01
Differently_abled_persons_worked 2.770000e+02
Material_and_skilled_Wages 4.033109e+02
Number_of_Completed_Works 4.352500e+03
Number_of_GPs_with_NIL_exp 3.000000e+01
Number_of_Ongoing_Works 1.541100e+04
Persondays_of_Central_Liability_so_far 1.216616e+06
SC_persondays 1.876655e+05
SC_workers_against_active_workers 3.426800e+04
ST_persondays 1.493465e+05
ST_workers_against_active_workers 2.843550e+04
Total_Adm_Expenditure 1.466534e+02
Total_Exp 3.450885e+03
Total_Households_Worked 3.426150e+04
Total_Individuals_Worked 4.996600e+04
Total_No_of_Active_Job_Cards 9.171450e+04
Total_No_of_Active_Workers 2.197795e+05
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 2.034500e+03
Total_No_of_JobCards_issued 2.233170e+05
Total_No_of_Workers 4.613180e+05
Total_No_of_Works_Takenup 1.865650e+04
Wages 2.703616e+03
Women_Persondays 4.497790e+05
percent_of_Category_B_Works 8.200000e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 4.520500e+01
percent_of_NRM_Expenditure 7.141000e+01
percentage_payments_gererated_within_15_days 1.000000e+02
Date NaN
Women_Persondays_Ratio 3.821299e-01
SC_Persondays_Ratio 1.533384e-01
ST_Persondays_Ratio 1.252367e-01
100_Days_HH_Ratio 5.666332e-02
month_num_fy_order 5.500000e+00
start_year_temp 2.020500e+03
75% max
state_code 1.800000e+01 1.800000e+01
district_code 1.827000e+03 1.827000e+03
Approved_Labour_Budget 1.803619e+06 2.594311e+06
Average_Wage_rate_per_day_per_person 2.347767e+02 2.981855e+02
Average_days_of_employment_provided_per_Household 3.900000e+01 5.000000e+01
Differently_abled_persons_worked 3.617500e+02 5.130000e+02
Material_and_skilled_Wages 8.139229e+02 3.319024e+03
Number_of_Completed_Works 6.496000e+03 1.011500e+04
Number_of_GPs_with_NIL_exp 4.400000e+01 4.290000e+02
Number_of_Ongoing_Works 1.700525e+04 2.048100e+04
Persondays_of_Central_Liability_so_far 1.689101e+06 2.517117e+06
SC_persondays 2.737548e+05 3.993640e+05
SC_workers_against_active_workers 3.583250e+04 3.637600e+04
ST_persondays 2.116965e+05 3.131570e+05
ST_workers_against_active_workers 2.954450e+04 3.090500e+04
Total_Adm_Expenditure 2.078986e+02 4.671982e+02
Total_Exp 5.110387e+03 9.303179e+03
Total_Households_Worked 4.230825e+04 6.182100e+04
Total_Individuals_Worked 6.295675e+04 9.507600e+04
Total_No_of_Active_Job_Cards 1.008132e+05 1.033300e+05
Total_No_of_Active_Workers 2.343192e+05 2.373780e+05
Total_No_of_HHs_completed_100_Days_of_Wage_Empl... 3.811500e+03 6.576000e+03
Total_No_of_JobCards_issued 2.549758e+05 2.760340e+05
Total_No_of_Workers 5.067615e+05 5.319190e+05
Total_No_of_Works_Takenup 2.171000e+04 2.617400e+04
Wages 3.907296e+03 6.536419e+03
Women_Persondays 6.479552e+05 1.080982e+06
percent_of_Category_B_Works 8.500000e+01 8.700000e+01
percent_of_Expenditure_on_Agriculture_Allied_Works 5.835750e+01 7.879000e+01
percent_of_NRM_Expenditure 7.670750e+01 8.639000e+01
percentage_payments_gererated_within_15_days 1.000000e+02 1.000000e+02
Date NaN NaN
Women_Persondays_Ratio 4.140111e-01 4.692050e-01
SC_Persondays_Ratio 1.630996e-01 1.984556e-01
ST_Persondays_Ratio 1.286752e-01 1.447539e-01
100_Days_HH_Ratio 8.629088e-02 1.307318e-01
month_num_fy_order 8.250000e+00 1.100000e+01
start_year_temp 2.022000e+03 2.023000e+03
--- Count of Zero Values in Key Numerical Columns for Nagpur ---
'100_Days_HH_Ratio': 1 zeros (1.39%)
--- Step 4: Analyzing Temporal Trends for Nagpur District ---
Nagpur Yearly Trends (Overall):
Total_Expenditure Total_Workers Avg_Daily_Wage \
fin_year
2018-2019 36003.984988 4745290.0 181.600677
2019-2020 28644.668042 4883713.0 189.068795
2020-2021 39765.555917 5215388.0 205.880322
2021-2022 44870.125487 5794989.0 228.033867
2022-2023 62695.221362 6082520.0 227.957119
2023-2024 46885.836807 6276155.0 242.116466
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
fin_year
2018-2019 0.066915 99.999167
2019-2020 0.064969 100.000000
2020-2021 0.070508 99.996667
2021-2022 0.051067 100.000000
2022-2023 0.046105 100.000000
2023-2024 0.039148 100.000000
--- Monthly Trends for Nagpur (Aggregated Across All Years) ---
Nagpur Monthly Trends (Aggregated Across All Years):
Total_Expenditure Total_Workers Avg_Daily_Wage \
month_full_name
April 1797.115563 2693376.0 153.728563
May 5381.415204 2703539.0 173.545886
June 9404.866004 2712227.0 200.540204
July 13595.653277 2721680.0 214.687457
August 17025.225160 2726478.0 220.551309
September 19753.344536 2741670.0 221.886354
October 24116.570366 2756815.0 223.605294
November 26908.565972 2766415.0 224.058869
December 30116.859184 2777625.0 224.576597
January 32986.434230 2790593.0 224.978947
February 35826.356791 2799678.0 223.039579
March 41952.986318 2807959.0 244.115437
Avg_100_Days_HH_Ratio Avg_Payment_Efficiency
month_full_name
April 0.000379 99.998333
May 0.004609 100.000000
June 0.012329 100.000000
July 0.029970 99.998333
August 0.043605 99.998333
September 0.053140 99.998333
October 0.065740 99.998333
November 0.076896 100.000000
December 0.085949 100.000000
January 0.093965 100.000000
February 0.101236 100.000000
March 0.109607 100.000000
--- Step 5: Monthly Trends by Financial Year for Nagpur District ---
--- Nagpur District MGNREGA Analysis: Focused Trends and Patterns Completed. --- This focused analysis provides detailed insights into MGNREGA implementation within Nagpur district. The temporal trends and insights derived here can inform local policy decisions.
In [ ]: